# Chargement du jeu de données

In [33]:
import os
from getpass import getpass

# Point both Hugging Face caches (datasets and hub) at one user-chosen directory.
cache_dir = input("Indicate path to all Hugging Face caches:")
os.environ.update({"HF_DATASETS_CACHE": cache_dir, "HF_HUB_CACHE": cache_dir})

# Prompt for the token with getpass so it is not echoed into the notebook output.
os.environ["HF_TOKEN"] = getpass("Enter your HuggingFace token:")

In [None]:
from rank_comparia.utils import load_comparia

# Load the "ministere-culture/comparia-reactions" dataset through the project helper.
# NOTE(review): presumably relies on the HF cache paths / token set in the first cell — confirm.
reactions = load_comparia("ministere-culture/comparia-reactions")

## Calcul des scores

On calcule des scores comme dans le notebook `rankers.ipynb`.

In [35]:
from rank_comparia.data_transformation import get_matches_with_score

# Derive the match table from the raw reactions. The preview in the next cell shows
# the resulting columns: model_a_name, model_b_name, conversation_pair_id, score_a, score_b.
matches = get_matches_with_score(reactions)

In [36]:
# Preview the first five matches.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""llama-3.1-8b""","""llama-3.3-70b""","""06b9efda752643ebb0565cb78c1f46…",0,6
"""claude-3-5-sonnet-v2""","""gpt-4o-2024-08-06""","""24e7eb51020f4411b687749f79a1b3…",2,0
"""llama-3.3-70b""","""llama-3.1-nemotron-70b-instruc…","""10dd55a4982348df8a210b732e0962…",-1,-2
"""deepseek-v3-chat""","""gemini-1.5-pro-002""","""8660169dd3904f42a2acfa5b352769…",3,0
"""llama-3.1-405b""","""llama-3.1-nemotron-70b-instruc…","""9523c20b4efa4dcb895204583e5d6e…",0,1


In [37]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Turn the two per-model scores of a match into a match outcome.

    Args:
        score_a: Score obtained by model A in the match.
        score_b: Score obtained by model B in the match.

    Returns:
        ``MatchScore.B`` when ``score_b`` is strictly greater than ``score_a``,
        ``MatchScore.A`` when it is strictly smaller, and ``MatchScore.Draw``
        when both scores are equal.
    """
    if score_b > score_a:
        return MatchScore.B
    if score_b < score_a:
        return MatchScore.A
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0):
    """Compute Elo ratings on a seeded random permutation of ``matches``.

    A fresh ``ELORanker`` (K=40) is created on every call, so successive calls
    with different seeds are independent of each other.

    Args:
        matches: Matches to replay; the list itself is not modified.
        model_names: Names of the players to register with the ranker.
        seed: Seed that determines the permutation of ``matches``.

    Returns:
        The ``players`` attribute of the ranker after all shuffled matches have
        been processed.
    """
    # A private generator yields the same permutation as seeding the module-level
    # RNG with `seed`, but does not reset the global random state that other
    # cells of the notebook may rely on.
    rng = random.Random(seed)
    matches_shuffle = rng.sample(matches, k=len(matches))

    ranker_shuffle = ELORanker(K=40)
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [38]:
# Every model that appears on either side of a match. Kept as a sorted list so the
# order handed to the ranker is identical on every run (the iteration order of a
# set of strings is not stable across interpreter sessions).
model_names = sorted(set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique()))

# NOTE: this rebinds `matches` from the polars DataFrame to a list of `Match`
# objects, so this cell cannot be re-run without first re-running the cell that
# builds the DataFrame.
matches = [
    Match(
        row["model_a_name"],
        row["model_b_name"],
        compute_match_score(row["score_a"], row["score_b"]),
    )
    for row in matches.to_dicts()
]

In [39]:
ranker = ELORanker(K=40)

# Replay the matches in a seeded random order. The permutation is stored under a
# new name instead of overwriting `matches`, so re-running this cell reproduces
# the same order rather than shuffling an already-shuffled list.
random.seed(1337)
shuffled_matches = random.sample(matches, k=len(matches))
ranker.add_players(model_names)  # type: ignore
ranker.compute_scores(matches=shuffled_matches)
ranker.get_scores()

{'deepseek-v3-chat': 1212.6624236322593,
 'claude-3-7-sonnet': 1135.0979653746404,
 'gemini-2.0-flash-001': 1127.6491187502902,
 'command-a': 1109.0736501865697,
 'gemma-3-27b': 1101.940135903943,
 'gemini-1.5-pro-002': 1100.4153564892965,
 'mistral-large-2411': 1094.3219154801855,
 'grok-3-mini-beta': 1093.7771261344612,
 'gemini-2.0-flash-exp': 1079.9107468559318,
 'gemma-3-4b': 1074.074137515472,
 'gemini-1.5-pro-001': 1073.9973563665326,
 'mistral-saba': 1073.6896122095982,
 'gemma-3-12b': 1072.9365403748707,
 'gpt-4.1-mini': 1066.4007451068437,
 'o4-mini': 1057.601475936693,
 'deepseek-v3-0324': 1054.3763448653876,
 'gpt-4o-mini-2024-07-18': 1051.529204861027,
 'llama-3.1-nemotron-70b-instruct': 1045.6443800662353,
 'llama-4-scout': 1044.42804026603,
 'deepseek-r1-distill-llama-70b': 1039.4057516599937,
 'deepseek-r1': 1032.3990428270208,
 'qwen2.5-7b-instruct': 1027.4715716462263,
 'gemma-2-27b-it-q8': 1016.0217958881971,
 'qwq-32b': 1014.3174253618948,
 'o3-mini': 1013.354731842

## Calcul d'un score de frugalité

Le score de frugalité est calculé à partir de données de consommation présentes dans le jeu de données `comparia-conversations`.

Number of matches and total output tokens per model:

In [40]:
import polars as pl
from rank_comparia.frugality import get_n_match, get_models_output_tokens

# Per-model output-token totals, joined onto the per-model match counts.
total_tokens = get_models_output_tokens(reactions)
number_by_model = get_n_match(ranker).join(total_tokens, on="model_name")

number_by_model

model_name,n_match,total_output_tokens
str,i64,f64
"""aya-expanse-8b""",519,961445.0
"""c4ai-command-r-08-2024""",1180,5.944341e6
"""chocolatine-2-14b-instruct-v2.…",598,511086.0
"""claude-3-5-sonnet-v2""",1994,2.818225e6
"""claude-3-7-sonnet""",319,2.162331e6
…,…,…
"""phi-3.5-mini-instruct""",462,851005.0
"""phi-4""",1616,3.086417e6
"""qwen2.5-7b-instruct""",427,870902.0
"""qwen2.5-coder-32b-instruct""",1641,4.787749e6


In [41]:
from rank_comparia.frugality import calculate_frugality_score, draw_chart

# Per-model consumption figures computed from the reactions data and the per-model
# match / token counts built in the previous cell.
# NOTE(review): `mean=True` appears to be what adds the mean_conso_per_match and
# mean_conso_per_token columns shown in the output — confirm against the helper.
frugal_scores = calculate_frugality_score(reactions, number_by_model, mean=True)

frugal_scores

model_name,conso_all_conv,n_match,total_output_tokens,mean_conso_per_match,mean_conso_per_token
str,f64,i64,f64,f64,f64
"""aya-expanse-8b""",3.62259,519,961445.0,0.00698,0.000004
"""c4ai-command-r-08-2024""",44.921088,1180,5.944341e6,0.038069,0.000008
"""chocolatine-2-14b-instruct-v2.…",1.853976,598,511086.0,0.0031,0.000004
"""claude-3-5-sonnet-v2""",378.314297,1994,2.818225e6,0.189726,0.000134
"""claude-3-7-sonnet""",290.26807,319,2.162331e6,0.909931,0.000134
…,…,…,…,…,…
"""phi-3.5-mini-instruct""",2.609332,462,851005.0,0.005648,0.000003
"""phi-4""",14.228012,1616,3.086417e6,0.008804,0.000005
"""qwen2.5-7b-instruct""",3.159217,427,870902.0,0.007399,0.000004
"""qwen2.5-coder-32b-instruct""",34.16509,1641,4.787749e6,0.02082,0.000007


In [42]:
# Two-column frame (model_name, elo_score) built from the ranker's players
# mapping, ordered from the highest rating to the lowest.
player_ratings = ranker.players
elo_scores = pl.DataFrame(
    {"model_name": player_ratings.keys(), "elo_score": player_ratings.values()},
    strict=False,
).sort("elo_score", descending=True)

elo_scores

model_name,elo_score
str,f64
"""deepseek-v3-chat""",1212.662424
"""claude-3-7-sonnet""",1135.097965
"""gemini-2.0-flash-001""",1127.649119
"""command-a""",1109.07365
"""gemma-3-27b""",1101.940136
…,…
"""mixtral-8x22b-instruct-v0.1""",862.715044
"""mixtral-8x7b-instruct-v0.1""",857.498403
"""mistral-nemo-2407""",849.105731
"""chocolatine-2-14b-instruct-v2.…",810.327547


In [43]:
from pathlib import Path

# Model metadata lives in ../data/models_data.json relative to the current working
# directory, so the notebook must be run from its own folder.
models_data_path = Path(".").resolve().parent / "data" / "models_data.json"
info_model = pl.read_json(source=models_data_path)

In [44]:
# Combine model metadata, Elo ratings and frugality figures into one table keyed
# on model_name. Both joins use polars' default strategy (inner), so a model that
# is missing from any of the three tables is dropped from final_df.
final_df = info_model.join(elo_scores, on="model_name").join(frugal_scores, on="model_name")

final_df

name,model_name,organization,license,elo_score,conso_all_conv,n_match,total_output_tokens,mean_conso_per_match,mean_conso_per_token
str,str,str,str,f64,f64,i64,f64,f64,f64
"""Aya-Expanse-8B""","""aya-expanse-8b""","""Cohere""","""CC-BY-NC-4.0""",964.474083,3.62259,519,961445.0,0.00698,0.000004
"""Command R (08-2024)""","""c4ai-command-r-08-2024""","""Cohere""","""CC-BY-NC-4.0""",964.449303,44.921088,1180,5.944341e6,0.038069,0.000008
"""Chocolatine-2-14b Instruct""","""chocolatine-2-14b-instruct-v2.…","""jpacifico (individual)""","""Apache 2.0""",810.327547,1.853976,598,511086.0,0.0031,0.000004
"""Claude 3.5 Sonnet V2""","""claude-3-5-sonnet-v2""","""Anthropic""","""Proprietary""",929.894009,378.314297,1994,2.818225e6,0.189726,0.000134
"""Command A""","""command-a""","""Cohere""","""CC-BY-NC-4.0""",1109.07365,18.815316,457,1.03253e6,0.041171,0.000018
…,…,…,…,…,…,…,…,…,…
"""Phi-3.5 Mini Instruct""","""phi-3.5-mini-instruct""","""Microsoft""","""MIT""",909.035838,2.609332,462,851005.0,0.005648,0.000003
"""Phi 4""","""phi-4""","""Microsoft""","""MIT""",968.730594,14.228012,1616,3.086417e6,0.008804,0.000005
"""Qwen2.5-7B""","""qwen2.5-7b-instruct""","""Alibaba""","""Apache 2.0""",1027.471572,3.159217,427,870902.0,0.007399,0.000004
"""Qwen2.5-Coder-32B-Instruct""","""qwen2.5-coder-32b-instruct""","""Alibaba""","""Apache 2.0""",944.144284,34.16509,1641,4.787749e6,0.02082,0.000007


# Test with more values

In [45]:
# Plot the combined table with the project's chart helper.
# NOTE(review): the frugality scores above were computed with mean=True while the
# chart is drawn with mean=False, and the title is left empty — confirm both are intended.
draw_chart(final_df, title="", log=True, scale="token", mean=False)