# Chargement du jeu de données

In [1]:
import os
from getpass import getpass

# Ask once for the directory that holds every Hugging Face cache. The answer is
# used verbatim as a path, so surrounding whitespace (easy to paste by accident)
# is dropped and a leading "~" is expanded to the home directory.
cache_dir = os.path.expanduser(input("Indicate path to all Hugging Face caches:").strip())
# An empty answer keeps the libraries' default cache locations instead of
# pointing both caches at the empty string.
if cache_dir:
    os.environ["HF_DATASETS_CACHE"] = cache_dir
    os.environ["HF_HUB_CACHE"] = cache_dir
# getpass does not echo the token, so it never ends up in the saved cell output.
os.environ["HF_TOKEN"] = getpass("Enter your HuggingFace token:")

In [2]:
from rank_comparia.utils import load_comparia

# Load the reactions dataset through the project helper.
# NOTE(review): the log printed by this cell also mentions the
# comparia-conversations dataset, so the helper presumably loads it too — confirm.
reactions = load_comparia("ministere-culture/comparia-reactions")

Using the latest cached version of the dataset since ministere-culture/comparia-reactions couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/jupyterhub-users/shared/projet_comparia/huggingface_hub/ministere-culture___comparia-reactions/default/0.0.0/92a324c10228176065909b52bbbaa16430e64c5a (last modified on Wed Jun  4 17:40:33 2025).
Using the latest cached version of the dataset since ministere-culture/comparia-conversations couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/jupyterhub-users/shared/projet_comparia/huggingface_hub/ministere-culture___comparia-conversations/default/0.0.0/dc40af6af1c14e68bf39d55f6e1573d2d6582f19 (last modified on Wed Jun  4 17:40:30 2025).


## Calcul des scores

On calcule des scores comme dans le notebook `rankers.ipynb`.

In [3]:
from rank_comparia.data_transformation import get_matches_with_score

# Build the match table from the reactions. Judging by the preview in the next
# cell, it has one row per conversation pair with the two model names and an
# integer score for each side — TODO confirm against the helper.
matches = get_matches_with_score(reactions)

In [4]:
# Preview the first five rows to check the columns and their dtypes.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""llama-3.1-405b""","""deepseek-v3-chat""","""fa310c827c8742f8aac9c3a0b80684…",0,4
"""llama-3.1-8b""","""mixtral-8x22b-instruct-v0.1""","""74265ff06f26405c9854ba82cb017e…",2,-1
"""claude-3-5-sonnet-v2""","""llama-3.1-70b""","""95dc4d4aeb9c4ab29227142ea6c8b3…",-1,0
"""gemini-2.0-flash-exp""","""qwen2.5-coder-32b-instruct""","""278e1b61621c47a68cc41b14ef1621…",2,-2
"""llama-3.3-70b""","""gemini-2.0-flash-001""","""a2de15a249b54200a4c6aa8f14fd48…",-2,1


In [5]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Convert the two per-model scores of a match into its outcome.

    Args:
        score_a: Score obtained by model A.
        score_b: Score obtained by model B.

    Returns:
        ``MatchScore.B`` when B scored strictly higher, ``MatchScore.A`` when A
        scored strictly higher, ``MatchScore.Draw`` otherwise.
    """
    if score_b > score_a:
        return MatchScore.B
    if score_a > score_b:
        return MatchScore.A
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0):
    """Run an ELO ranking on a seeded random permutation of ``matches``.

    Args:
        matches: Matches to replay. The list itself is not modified; a shuffled
            copy is used.
        model_names: Names registered as players before the scores are computed.
        seed: Seed of the shuffle. The same seed always gives the same order.

    Returns:
        The ``players`` attribute of the ranker once every match has been
        processed.
    """
    ranker_shuffle = ELORanker(K=40)
    # A private generator produces the same permutation as seeding the
    # module-level one, without resetting the global RNG state for other code
    # running in the same kernel.
    rng = random.Random(seed)
    matches_shuffle = rng.sample(matches, k=len(matches))
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [6]:
# Every model seen on either side of a match, as a sorted list. A list is the
# type `get_shuffled_results` declares for its `model_names` parameter, and
# sorting gives an order that does not change between kernel runs (iteration
# order of a set of strings does).
model_names = sorted(set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique()))
# NOTE: `matches` is rebound here from the DataFrame to a list of Match objects;
# re-run the cell that builds the DataFrame before re-running this one.
matches = [
    Match(
        row["model_a_name"],
        row["model_b_name"],
        compute_match_score(row["score_a"], row["score_b"]),
    )
    for row in matches.to_dicts()
]

In [7]:
ranker = ELORanker(K=40)

# Shuffle into a new name with a dedicated generator: the cell can be re-run
# without shuffling an already shuffled list (which would change the scores),
# and the global RNG state is left untouched. The permutation is the same one
# that `random.seed(1337)` followed by `random.sample` yields on a first run.
rng = random.Random(1337)
shuffled_matches = rng.sample(matches, k=len(matches))
# list(...) accepts `model_names` whether it is a set or already a list.
ranker.add_players(list(model_names))
ranker.compute_scores(matches=shuffled_matches)
ranker.get_scores()

{'gemma-3-27b': 1191.3395006565374,
 'gemini-2.0-flash-exp': 1144.6204338742177,
 'gpt-4.1-mini': 1136.9810148078843,
 'claude-3-7-sonnet': 1136.9727109548724,
 'llama-3.1-nemotron-70b-instruct': 1129.535115932537,
 'deepseek-v3-0324': 1127.343019633871,
 'command-a': 1124.262695420955,
 'gemma-3-12b': 1110.973006187927,
 'gemini-2.0-flash-001': 1096.177320994962,
 'grok-3-mini-beta': 1095.5316626903282,
 'gemini-1.5-pro-002': 1094.2971057766233,
 'llama-4-scout': 1086.8984357930387,
 'mistral-large-2411': 1079.1626953808234,
 'gemma-3-4b': 1067.4333848739445,
 'claude-3-5-sonnet-v2': 1061.5224933944878,
 'deepseek-v3-chat': 1055.8583879004634,
 'gpt-4.1-nano': 1043.3380939198712,
 'gemini-1.5-pro-001': 1041.22455039952,
 'qwq-32b': 1024.5967895935885,
 'mistral-small-3.1-24b': 1024.3801866853216,
 'gpt-4o-2024-08-06': 1019.9560175155277,
 'deepseek-r1': 1008.2185359046271,
 'mixtral-8x7b-instruct-v0.1': 1000.331257052885,
 'c4ai-command-r-08-2024': 992.6319053544709,
 'mistral-saba': 

## Calcul d'un score de frugalité

Le score de frugalité est calculé à partir de données de consommation présentes dans le jeu de données `comparia-conversations`.

### Calcul du nombre de matchs et du nombre total de tokens générés par modèle

In [8]:
import polars as pl
from rank_comparia.frugality import get_n_match, get_models_output_tokens

# Per-model match counts (taken from the ranker) and per-model output token
# totals (taken from the reactions data), joined on the model name.
n_match_by_model = get_n_match(ranker)
total_tokens = get_models_output_tokens(reactions)

number_by_model = n_match_by_model.join(total_tokens, on="model_name")

number_by_model

model_name,n_match,total_output_tokens
str,i64,f64
"""aya-expanse-8b""",519,961445.0
"""c4ai-command-r-08-2024""",1180,5.944341e6
"""chocolatine-2-14b-instruct-v2.…",598,511086.0
"""claude-3-5-sonnet-v2""",1994,2.818225e6
"""claude-3-7-sonnet""",319,2.162331e6
…,…,…
"""phi-3.5-mini-instruct""",462,851005.0
"""phi-4""",1616,3.086417e6
"""qwen2.5-7b-instruct""",427,870902.0
"""qwen2.5-coder-32b-instruct""",1641,4.787749e6


### Calcul du score de frugalité

Calcul du score énergétique. Le paramètre `mean` permet de moyenner les scores (`True` : le score est moyenné ; `False` : il ne l'est pas).  
Si l'on choisit de moyenner, le moyennage est effectué à la fois par token et par nombre de matchs.

In [None]:
from rank_comparia.frugality import calculate_frugality_score, draw_chart

# mean=True requests averaged consumption; the output then carries the
# `mean_conso_per_match` and `mean_conso_per_token` columns.
# NOTE(review): the output also contains `total_output_tokens_right`, which
# looks like a join duplicate of `total_output_tokens` (already present in
# `number_by_model`) — confirm and drop it if nothing downstream needs it.
frugal_scores = calculate_frugality_score(reactions, number_by_model, mean=True)

frugal_scores

model_name,total_output_tokens,conso_all_conv,n_match,total_output_tokens_right,mean_conso_per_match,mean_conso_per_token
str,f64,f64,i64,f64,f64,f64
"""aya-expanse-8b""",961445.0,3.62259,519,961445.0,0.00698,0.000004
"""c4ai-command-r-08-2024""",5.944341e6,44.921088,1180,5.944341e6,0.038069,0.000008
"""chocolatine-2-14b-instruct-v2.…",511086.0,1.853976,598,511086.0,0.0031,0.000004
"""claude-3-5-sonnet-v2""",2.818225e6,378.314297,1994,2.818225e6,0.189726,0.000134
"""claude-3-7-sonnet""",2.162331e6,290.26807,319,2.162331e6,0.909931,0.000134
…,…,…,…,…,…,…
"""phi-3.5-mini-instruct""",851005.0,2.609332,462,851005.0,0.005648,0.000003
"""phi-4""",3.086417e6,14.228012,1616,3.086417e6,0.008804,0.000005
"""qwen2.5-7b-instruct""",870902.0,3.159217,427,870902.0,0.007399,0.000004
"""qwen2.5-coder-32b-instruct""",4.787749e6,34.16509,1641,4.787749e6,0.02082,0.000007


In [10]:
# One row per player of the ranker with its ELO score, best score first.
players = ranker.players
elo_columns = {
    "model_name": list(players.keys()),
    "elo_score": list(players.values()),
}
elo_scores = pl.DataFrame(elo_columns, strict=False).sort("elo_score", descending=True)

elo_scores

model_name,elo_score
str,f64
"""gemma-3-27b""",1191.339501
"""gemini-2.0-flash-exp""",1144.620434
"""gpt-4.1-mini""",1136.981015
"""claude-3-7-sonnet""",1136.972711
"""llama-3.1-nemotron-70b-instruc…",1129.535116
…,…
"""gpt-4o-mini-2024-07-18""",882.995125
"""phi-3.5-mini-instruct""",859.943908
"""ministral-8b-instruct-2410""",846.897052
"""gemma-2-9b-it""",804.923561


## Création du graphique de frugalité

### Chargement des informations concernant les modèles du comparateur

In [11]:
from pathlib import Path

# The models description file sits in the `data` folder next to the current
# working directory's parent, so the path depends on where the kernel runs.
models_data_path = Path(".").resolve().parent / "data" / "models_data.json"
info_model = pl.read_json(source=models_data_path)

In [12]:
# Attach the ELO scores, then the frugality figures, to the model descriptions;
# both joins are keyed on the model name.
models_with_elo = info_model.join(elo_scores, on="model_name")
final_df = models_with_elo.join(frugal_scores, on="model_name")

final_df

name,model_name,organization,license,elo_score,total_output_tokens,conso_all_conv,n_match,total_output_tokens_right,mean_conso_per_match,mean_conso_per_token
str,str,str,str,f64,f64,f64,i64,f64,f64,f64
"""Aya-Expanse-8B""","""aya-expanse-8b""","""Cohere""","""CC-BY-NC-4.0""",971.816744,961445.0,3.62259,519,961445.0,0.00698,0.000004
"""Command R (08-2024)""","""c4ai-command-r-08-2024""","""Cohere""","""CC-BY-NC-4.0""",992.631905,5.944341e6,44.921088,1180,5.944341e6,0.038069,0.000008
"""Chocolatine-2-14b Instruct""","""chocolatine-2-14b-instruct-v2.…","""jpacifico (individual)""","""Apache 2.0""",799.745462,511086.0,1.853976,598,511086.0,0.0031,0.000004
"""Claude 3.5 Sonnet V2""","""claude-3-5-sonnet-v2""","""Anthropic""","""Proprietary""",1061.522493,2.818225e6,378.314297,1994,2.818225e6,0.189726,0.000134
"""Command A""","""command-a""","""Cohere""","""CC-BY-NC-4.0""",1124.262695,1.03253e6,18.815316,457,1.03253e6,0.041171,0.000018
…,…,…,…,…,…,…,…,…,…,…
"""Phi-3.5 Mini Instruct""","""phi-3.5-mini-instruct""","""Microsoft""","""MIT""",859.943908,851005.0,2.609332,462,851005.0,0.005648,0.000003
"""Phi 4""","""phi-4""","""Microsoft""","""MIT""",907.827299,3.086417e6,14.228012,1616,3.086417e6,0.008804,0.000005
"""Qwen2.5-7B""","""qwen2.5-7b-instruct""","""Alibaba""","""Apache 2.0""",932.13945,870902.0,3.159217,427,870902.0,0.007399,0.000004
"""Qwen2.5-Coder-32B-Instruct""","""qwen2.5-coder-32b-instruct""","""Alibaba""","""Apache 2.0""",942.24994,4.787749e6,34.16509,1641,4.787749e6,0.02082,0.000007


In [13]:
from rank_comparia.utils import save_data

# Same `data` folder the models description was read from; it is resolved from
# the current working directory, so it depends on where the kernel runs.
save_path = Path(".").resolve().parent / "data"
# Persist the merged table through the project helper.
# NOTE(review): the output file format and extension are decided by `save_data` — confirm.
save_data(final_df, "all_info_for_chart_drawing", save_path)

### Génération du graphique de frugalité

Les paramètres possibles :  
- `log` : échelle du graphique, linéaire (`log = False`) ou logarithmique (`log = True`) ;  
- `mean` : utiliser les consommations moyennées (`mean = True`) ou non (`mean = False`) ;  
- `scale` : choix du moyennage si `mean = True` : `token` pour un moyennage par token, `match` pour un moyennage par nombre de matchs ;  
- `save` : enregistrer le graphique au format HTML.

In [15]:
# Draw the consumption-versus-ranking chart on a log scale and save it.
# NOTE(review): the notebook describes `scale` as applying only when mean=True,
# so with mean=False the scale="token" argument is presumably ignored — confirm
# that this combination is intended.
draw_chart(final_df, title="consommation selon classement", log=True, scale="token", mean=False, save=True)