# Axe 2

#### Packages

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import random


### 1) Processing du df composant m√©dical - Emission carbonne (kgCO2e)

In [None]:


def concat_xlsx_from_folder(folder_path: str) -> pd.DataFrame:
    """
    Parcourt r√©cursivement un dossier git et concat√®ne verticalement
    toutes les tables issues des fichiers .xlsx.

    Hypoth√®ses :
    - chaque fichier .xlsx contient une table avec exactement 2 colonnes
    - la premi√®re ligne correspond aux labels et est ignor√©e
    - colonne A : produit
    - colonne B : Emission_kgCO2e_unitaire
    """

    folder = Path(folder_path)
    all_rows = []

    for file in folder.rglob("*.xlsx"):
        df = pd.read_excel(file, header=0)

        df = df.iloc[:, :2]
        df.columns = ["produit", "Emission_kgCO2e_unitaire"]

        filename = file.stem
        if filename.endswith("_parProduit"):
            df["type_de_donnees"] = "parProduit"
        elif filename.endswith("_m2"):
            df["type_de_donnees"] = "m2"
        elif filename.endswith("_parKG"):
            df["type_de_donnees"] = "parKG"
        else:
            continue

        all_rows.append(df)

    if not all_rows:
        return pd.DataFrame(
            columns=["produit", "Emission_kgCO2e_unitaire", "type_de_donnees"]
        )

    df = pd.concat(all_rows, axis=0, ignore_index=True)

    df["Emission_kgCO2e_unitaire"] = (
        df["Emission_kgCO2e_unitaire"]
        .astype(str)
        .str.replace(",", ".", regex=False)
    )
    
    df["Emission_kgCO2e_unitaire"] = pd.to_numeric(
        df["Emission_kgCO2e_unitaire"],
        errors="coerce"
    )
    
    return df


def concat_xlsx_from_folders(list_paths: list[str | Path]) -> pd.DataFrame:
    """
    Concat√®ne verticalement les tables issues de plusieurs dossiers.

    Param√®tre
    ----------
    list_paths : list[str | Path]
        Liste de chemins vers des dossiers contenant des fichiers .xlsx

    Retour
    ------
    DataFrame avec les colonnes :
    - produit
    - Emission_kgCO2e_unitaire
    - type_de_donnees
    """

    all_rows = []

    for folder_path in list_paths:
        folder = Path(folder_path)

        if not folder.exists():
            raise FileNotFoundError(f"Dossier introuvable : {folder}")

        xlsx_files = list(folder.rglob("*.xlsx"))
        if not xlsx_files:
            raise ValueError(f"Aucun fichier .xlsx trouv√© dans {folder}")

        for file in xlsx_files:
            df = pd.read_excel(file)

            df = df.iloc[:, :2]
            df.columns = ["produit", "Emission_kgCO2e_unitaire"]

            name = file.stem
            if name.endswith("_parProduit"):
                df["type_de_donnees"] = "parProduit"
            elif name.endswith("_m2"):
                df["type_de_donnees"] = "m2"
            elif name.endswith("_parKG"):
                df["type_de_donnees"] = "parKG"
            else:
                continue

            all_rows.append(df)

    if not all_rows:
        return pd.DataFrame(
            columns=["produit", "Emission_kgCO2e_unitaire", "type_de_donnees"]
        )

    df = pd.concat(all_rows, axis=0, ignore_index=True)
    
    df["Emission_kgCO2e_unitaire"] = (
        df["Emission_kgCO2e_unitaire"]
        .astype(str)
        .str.replace(",", ".", regex=False)
    )

    df["Emission_kgCO2e_unitaire"] = pd.to_numeric(
        df["Emission_kgCO2e_unitaire"],
        errors="coerce"
    )
    
    return df



def random_value_dict(
    df: pd.DataFrame,
    nom_col: str = "produit",
    nom_to_ignore: list | None = None,
    min_value: float = 1.0,
    max_value: float = 3000.0,
) -> dict:
    """
    Construit un dictionnaire :
    - cl√©s : valeurs uniques de df[nom_col]
    - valeurs : nombre al√©atoire strictement > 1
      tir√© dans [min_value, max_value]
    """

    if nom_to_ignore is None:
        nom_to_ignore = []

    if min_value <= 1:
        min_value = 1.000001

    uniques = df[nom_col].dropna().unique()

    return {
        val: random.uniform(min_value, max_value)
        for val in uniques
        if val not in nom_to_ignore
    }


def compute_emission_hopital(
    df: pd.DataFrame,
    type_de_donnees: str = "type_de_donnees",
    dict_m2: dict = {"pansements composites": (3 * 10**2, 10)},
    dict_parKG: dict = {"instrument usage unique": 1000, "complement alimentaire":10000},
    dict_nb_parProduit: dict | None = None,
) -> pd.DataFrame:
    """
    Ajoute la colonne Emission_carbonne_total_des_produits_kgCO2e selon
    le type de donn√©es associ√© √† chaque produit.
    """

    if dict_nb_parProduit is None:
        dict_nb_parProduit = {}

    def compute_row(row):
        produit = row["produit"]
        emission_unit = row["Emission_kgCO2e_unitaire"]
        t = row[type_de_donnees]

        if t == "parProduit":
            return emission_unit * dict_nb_parProduit.get(produit, np.nan)

        if t == "m2":
            longueur, largeur = dict_m2.get(produit, (np.nan, np.nan))
            return emission_unit * longueur * largeur

        if t == "parKG":
            return emission_unit * dict_parKG.get(produit, np.nan)

        return np.nan

    df = df.copy()
    df["Emission_carbonne_total_des_produits_kgCO2e"] = df.apply(compute_row, axis=1)

    return df

# =========================
# 3. Construction du DataFrame
# =========================

# extract_path = "sujets/chu/Axe_2/Axe_2_bdd"
# extract_path = "C:/Users/jerem/Documents/GitHub/datachallenge2026/sujets/chu/Axe_2/Axe_2_bdd"
extract_path = "/home/onyxia/datachallenge2026/sujets/chu/Axe_2/Axe_2_bdd"
# paths = [
#     r"sujets\chu\Axe_2\Axe_2_bdd-20260117T004817Z-1-001\Axe_2_bdd",
#     r"sujets\chu\Axe_2\autre_dossier"
# ]

df_concat = concat_xlsx_from_folder(extract_path)
# df_concat = concat_xlsx_from_folders(paths)

print(df_concat.head())

# =========================
# 4. Dictionnaires exemples
# =========================

dict_nb_parProduit = random_value_dict(df_concat)

print(dict_nb_parProduit)

# =========================
# 5. Calcul des √©missions
# =========================

df_final = compute_emission_hopital(
    df_concat,
    dict_nb_parProduit=dict_nb_parProduit
)

print(df_final.head())



                   produit  Emission_kgCO2e_unitaire type_de_donnees
0    pansements composites                    0.9400              m2
1           Sonde urinaire                   51.9034      parProduit
2  Set de sondage urinaire                   89.4064      parProduit
3      Collecteur de jambe                  250.7186      parProduit
4       Collecteur de nuit                  676.6522      parProduit
{'pansements composites': 2880.7506438973483, 'Sonde urinaire': 2.3058391190603773, 'Set de sondage urinaire': 1301.0278534529064, 'Collecteur de jambe': 1871.2095479103848, 'Collecteur de nuit': 2265.768898794339, 'Etuis p√©niens': 1088.7167868279912, 'Poche pour stomie': 3.131259971503482, 'Support pour stomie': 2959.0238083770532, 'Changes complets': 2375.889528275952, 'Slips absorbants': 250.72891706628525, 'Protections absorbantes': 1887.063988630245, 'Couches droites': 1329.456550312355, 'Al√®ses': 1383.9846297425656, 'pansement': 2495.056711895948, 'uteruscopes': 2579.2114

In [35]:
# Doublons sur une variable particuli√®re (ex: 'email')
colonne = 'produit'

# Trouver les valeurs dupliqu√©es dans cette colonne
valeurs_doublons = df_final[df_final.duplicated(subset=[colonne], keep=False)]

# Afficher les doublons tri√©s pour mieux voir
doublons_tries = valeurs_doublons.sort_values(colonne)
print(f"Doublons sur la colonne '{colonne}' :")
print(doublons_tries)

# Voir les valeurs qui se r√©p√®tent
comptage = df_final[colonne].value_counts()
valeurs_repetees = comptage[comptage > 1]
print(f"\nValeurs r√©p√©t√©es dans '{colonne}' :")
print(valeurs_repetees)


Doublons sur la colonne 'produit' :
Empty DataFrame
Columns: [produit, Emission_kgCO2e_unitaire, type_de_donnees, Emission_carbonne_total_des_produits_kgCO2e]
Index: []

Valeurs r√©p√©t√©es dans 'produit' :
Series([], Name: count, dtype: int64)


#### 1.1) Obtention du dataframe final et exportation

In [36]:
colonnes_a_conserver = [
    "produit",
    "Emission_kgCO2e_unitaire",
    "Emission_carbonne_total_des_produits_kgCO2e"
]

df_export = df_final[colonnes_a_conserver].copy()

# =========================
# Chemin de sortie
# =========================

output_path = Path(r"results\df_composant_medical_emissions_carbones.xlsx")

# Cr√©ation du dossier si besoin
output_path.parent.mkdir(parents=True, exist_ok=True)

# =========================
# Export Excel
# =========================

df_export.to_excel(output_path, index=False)

### 2) Pre processing NLP des bases **df_composant_medical_emissions_carbones.xlsx** et **DISPOSITIFS_MED.xlsx** et Classification

In [8]:
# on sup un encoding: utf-8
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

import torch

import numpy as np
import pandas as pd

# !pip install sentence_transformers
from sentence_transformers import SentenceTransformer

# !pip install transformers
from transformers import pipeline

# !pip install rank_bm25
from rank_bm25 import BM25Okapi

# !pip install spacy transformers sentencepiece torch
# !python -m spacy download fr_core_news_md
# !python -m spacy download en_core_web_sm
# !pip install rank-bm25
# !pip install sentence-transformers

# !pip install openpyxl
# !pip install sentencepiece

# RAPPEL : relancer le kernel si pb d√©tection de packages

In [9]:


# print("CUDA available :", torch.cuda.is_available())
# print("GPU name :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

# import torch
# print(torch.cuda.memory_allocated() / 1e9, "GB GPU used")

# !nvidia-smi


In [None]:

# ============================================================
# 0) I/O + s√©lection colonnes df2
# ============================================================

DF2_KEEP_COLS = [
    "Nomenclature achat",
    "Cat√©gories d'achat\n(N-2)",
    "Segments  d'achat\n(N-3)",
    "Sous-segment",
    "Produit √©l√©mentaire",
    "Code des Cat√©gories Homog√®nes \nde fournitures et prestations",
]

def load_and_select_df2(path_df2_xlsx: str) -> pd.DataFrame:
    df2 = pd.read_excel(path_df2_xlsx)
    missing = [c for c in DF2_KEEP_COLS if c not in df2.columns]
    if missing:
        raise ValueError(f"Colonnes manquantes dans df2: {missing}\nColonnes trouv√©es: {list(df2.columns)}")
    return df2[DF2_KEEP_COLS].copy()




# ============================================================
# 1) Pr√©traitement + traduction (identique logique)
# ============================================================



def build_fr_nlp(model_name: str = "fr_core_news_md"):
    import spacy
    return spacy.load(model_name, disable=["ner", "parser"])

def build_en_nlp(model_name: str = "en_core_web_sm"):
    import spacy
    return spacy.load(model_name, disable=["ner", "parser"])

def preprocess_fr(texts: Iterable[str], nlp=None) -> List[str]:
    if nlp is None:
        nlp = build_fr_nlp()
    out = []
    for doc in nlp.pipe([("" if x is None else str(x)) for x in texts], batch_size=256):
        toks = []
        for t in doc:
            if t.is_space or t.is_punct or t.like_num:
                continue
            if t.is_stop:
                continue
            lem = (t.lemma_ or t.text).lower().strip()
            if len(lem) < 2:
                continue
            toks.append(lem)
        out.append(" ".join(toks))
    return out

def preprocess_en(texts: Iterable[str], nlp=None) -> List[str]:
    if nlp is None:
        nlp = build_en_nlp()
    out = []
    for doc in nlp.pipe([("" if x is None else str(x)) for x in texts], batch_size=256):
        toks = []
        for t in doc:
            if t.is_space or t.is_punct or t.like_num:
                continue
            if t.is_stop:
                continue
            lem = (t.lemma_ or t.text).lower().strip()
            if len(lem) < 2:
                continue
            toks.append(lem)
        out.append(" ".join(toks))
    return out

@dataclass
class TranslatorFR2EN:
    model_name: str = "Helsinki-NLP/opus-mt-fr-en"
    # device: int = -1  # -1 CPU, 0 GPU
    device: int = 0  # -1 CPU, 0 GPU

    def __post_init__(self):
        self.pipe = pipeline("translation", model=self.model_name, device=self.device)

    def translate(self, texts: Iterable[str], batch_size: int = 16) -> List[str]:
        texts_list = [("" if x is None else str(x)) for x in texts]
        outputs = self.pipe(texts_list, batch_size=batch_size, truncation=True)
        return [o["translation_text"] for o in outputs]


def add_processed_columns(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    col_df1_produit: str = "produit",
    col_df2_best: str = "Produit √©l√©mentaire",
    translator: Optional[TranslatorFR2EN] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    On se concentre sur 'Produit √©l√©mentaire' pour df2 (champ le plus proche),
    et on garde aussi un champ df2 '__df2_join_en_proc' pour √©ventuellement enrichir.
    """
    if col_df1_produit not in df1.columns:
        raise ValueError(f"df1 n'a pas la colonne {col_df1_produit}")
    if col_df2_best not in df2.columns:
        raise ValueError(f"df2 n'a pas la colonne {col_df2_best}")

    fr_nlp = build_fr_nlp()
    en_nlp = build_en_nlp()
    if translator is None:
        translator = TranslatorFR2EN()

    df1 = df1.copy()
    df2 = df2.copy()

    # df1 produit
    df1["produit_fr_proc"] = preprocess_fr(df1[col_df1_produit].astype(str), nlp=fr_nlp)
    df1["produit_en"] = translator.translate(df1["produit_fr_proc"].tolist())
    df1["produit_en_proc"] = preprocess_en(df1["produit_en"], nlp=en_nlp)

    # df2 produit √©l√©mentaire (principal)
    df2["produit_elem_fr_proc"] = preprocess_fr(df2[col_df2_best].astype(str), nlp=fr_nlp)
    df2["produit_elem_en"] = translator.translate(df2["produit_elem_fr_proc"].tolist())
    df2["produit_elem_en_proc"] = preprocess_en(df2["produit_elem_en"], nlp=en_nlp)

    # champ joint optionnel (pond√©ration: Produit √©l√©mentaire x3)
    # utile si tu veux plus tard int√©grer d'autres colonnes, sans casser l'approche
    df2["__df2_join_en_proc"] = (
        (df2["produit_elem_en_proc"].fillna("") + " ") * 3
    ).str.replace(r"\s+", " ", regex=True).str.strip()

    return df1, df2


# ============================================================
# 2) Filtre lexical BM25 (avant embeddings)
# ============================================================

def bm25_candidates(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    df1_text_col: str = "produit_en_proc",
    df2_text_col: str = "produit_elem_en_proc",
    topk_bm25: int = 20,
) -> np.ndarray:
    """
    Retourne un tableau d'indices (n_df2, topk_bm25) : les meilleurs candidats df1
    pour chaque ligne df2 selon BM25.

    On tokenize simplement par split() car les textes sont d√©j√† normalis√©s.
    """
    

    corpus_tokens = [str(x).split() for x in df1[df1_text_col].fillna("").tolist()]
    bm25 = BM25Okapi(corpus_tokens)

    cand_idx = np.zeros((df2.shape[0], topk_bm25), dtype=int)

    for i, q in enumerate(df2[df2_text_col].fillna("").tolist()):
        q_tokens = str(q).split()
        scores = bm25.get_scores(q_tokens)  # (n_df1,)
        best = np.argsort(-scores)[:topk_bm25]
        cand_idx[i, :] = best

    return cand_idx


# ============================================================
# 3) Rerank embeddings sur candidats + proba Top-5
# ============================================================

def embed_texts(texts: List[str], model_name: str = "pritamdeka/S-PubMedBert-MS-MARCO") -> np.ndarray:
    model = SentenceTransformer(model_name)
    emb = model.encode(texts, normalize_embeddings=True, batch_size=64, show_progress_bar=True)
    return np.asarray(emb)

def softmax(x: np.ndarray, temperature: float = 0.07) -> np.ndarray:
    x = x / max(temperature, 1e-6)
    x = x - x.max(axis=1, keepdims=True)
    expx = np.exp(x)
    return expx / expx.sum(axis=1, keepdims=True)

def match_with_bm25_then_embeddings(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    col_df1_key: str = "produit",
    df1_text_col: str = "produit_en_proc",
    df2_text_col: str = "produit_elem_en_proc",
    topk_bm25: int = 20,
    topk_final: int = 5,
    embedding_model: str = "pritamdeka/S-PubMedBert-MS-MARCO",
    temperature: float = 0.07,
) -> pd.DataFrame:
    """
    Pipeline:
    - BM25 filtre les candidats df1 (topk_bm25)
    - embeddings rerank uniquement ces candidats
    - softmax sur similarit√©s => pseudo-proba
    - renvoie un tableau wide top-5 (et long via attrs)
    """

    # df1 unique
    df1u = df1[[col_df1_key, df1_text_col]].drop_duplicates(subset=[col_df1_key]).reset_index(drop=True)

    # candidats BM25
    cand_idx = bm25_candidates(
        df1u,
        df2,
        df1_text_col=df1_text_col,
        df2_text_col=df2_text_col,
        topk_bm25=topk_bm25,
    )  # (n2, topk_bm25)

    # embeddings df1 (une seule fois)
    emb1 = embed_texts(df1u[df1_text_col].fillna("").tolist(), model_name=embedding_model)

    # embeddings df2 (sur champ principal)
    emb2 = embed_texts(df2[df2_text_col].fillna("").tolist(), model_name=embedding_model)

    # calcul similarities restreint
    n2 = df2.shape[0]
    sims = np.empty((n2, topk_bm25), dtype=float)

    for i in range(n2):
        idx = cand_idx[i]
        sims[i, :] = emb2[i] @ emb1[idx].T  # cosine car normalis√©

    probs = softmax(sims, temperature=temperature)  # (n2, topk_bm25)

    # topk_final parmi candidats
    top_local = np.argsort(-probs, axis=1)[:, :topk_final]             # indices 0..topk_bm25-1
    top_prob = np.take_along_axis(probs, top_local, axis=1)            # (n2, topk_final)
    top_global_idx = np.take_along_axis(cand_idx, top_local, axis=1)   # indices dans df1u
    top_prod = df1u[col_df1_key].to_numpy()[top_global_idx]            # (n2, topk_final)

    # outputs
    rows = []
    for i in range(n2):
        for r in range(topk_final):
            rows.append({
                "Nomenclature achat": df2.iloc[i]["Nomenclature achat"],
                "rank": r + 1,
                "produit_match": top_prod[i, r],
                "proba": float(top_prob[i, r]),
            })
    out_long = pd.DataFrame(rows)

    wide = {"Nomenclature achat": df2["Nomenclature achat"].to_numpy()}
    for r in range(topk_final):
        wide[f"top{r+1}_produit"] = top_prod[:, r]
        wide[f"top{r+1}_proba"] = top_prob[:, r]
    out_wide = pd.DataFrame(wide)

    out_wide.attrs["out_long"] = out_long
    return out_wide


# ============================================================
# 4) Utilitaires pratiques
# ============================================================

def keep_df2_columns(df2: pd.DataFrame) -> pd.DataFrame:
    # m√™mes colonnes que tu veux conserver
    keep = [
        "Nomenclature achat",
        "Cat√©gories d'achat\n(N-2)",
        "Segments  d'achat\n(N-3)",
        "Sous-segment",
        "Produit √©l√©mentaire",
        "Code des Cat√©gories Homog√®nes \nde fournitures et prestations",
    ]
    missing = [c for c in keep if c not in df2.columns]
    if missing:
        raise ValueError(f"Colonnes manquantes: {missing}\nColonnes df2: {list(df2.columns)}")
    return df2[keep].copy()


# ============================================================
# 5) Exemple d'ex√©cution
# ============================================================

if __name__ == "__main__":
    # path_df1 = r"df_composant_medical_emissions_carbones.xlsx"
    # path_df2 = r"DISPOSITIFS_MED.xlsx"

    path_df1 = r"/home/onyxia/datachallenge2026/sujets/chu/Axe_2/results/df_composant_medical_emissions_carbones.xlsx"
    path_df2 = r"/home/onyxia/datachallenge2026/sujets/chu/Axe_2/DISPOSITIFS_MED.xlsx"

    df1 = pd.read_excel(path_df1)
    df2 = load_and_select_df2(path_df2)

    # print(df1.head())
    # print(df2.head())

    # TEMPS : 1min9s
    # NLP + traduction (1min9s)
    translator = TranslatorFR2EN(device=0)  # GPU
    df1p, df2p = add_processed_columns(
        df1, df2,
        col_df1_produit="produit",
        col_df2_best="Produit √©l√©mentaire",
        translator=translator
    )


    # TEMPS : 24s
    # Matching BM25 -> Embeddings -> Top5 
    match_wide = match_with_bm25_then_embeddings(
        df1p, df2p,
        col_df1_key="produit",
        df1_text_col="produit_en_proc",
        df2_text_col="produit_elem_en_proc",
        topk_bm25=20,         
        topk_final=5,
        embedding_model="pritamdeka/S-PubMedBert-MS-MARCO",
        temperature=0.07
    )

    # # Sauvegarde
    # match_wide.to_excel("/home/onyxia/datachallenge2026/sujets/chu/Axe_2/results/MATCH_df2_vers_df1_top5.xlsx", index=False)
    # match_wide.attrs["out_long"].to_excel("/home/onyxia/datachallenge2026/sujets/chu/Axe_2/results/MATCH_df2_vers_df1_top5_long.xlsx", index=False)



Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 17.75it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:04<00:00,  9.54it/s]


In [13]:
# # df1 anglais
# print(df1p.head()) 

# # df2 anglais
# print(df2p.head()) 

### 2bis) Version o√π on remplace la similarit√© cosinus par une p√©nalisation TF-IDF

In [None]:
# ============================================================
# 0) I/O + s√©lection colonnes df2
# ============================================================

DF2_KEEP_COLS = [
    "Nomenclature achat",
    "Cat√©gories d'achat\n(N-2)",
    "Segments  d'achat\n(N-3)",
    "Sous-segment",
    "Produit √©l√©mentaire",
    "Code des Cat√©gories Homog√®nes \nde fournitures et prestations",
]

def load_and_select_df2(path_df2_xlsx: str) -> pd.DataFrame:
    df2 = pd.read_excel(path_df2_xlsx)
    missing = [c for c in DF2_KEEP_COLS if c not in df2.columns]
    if missing:
        raise ValueError(f"Colonnes manquantes dans df2: {missing}\nColonnes trouv√©es: {list(df2.columns)}")
    return df2[DF2_KEEP_COLS].copy()




# ============================================================
# 1) Pr√©traitement + traduction (identique logique)
# ============================================================



def build_fr_nlp(model_name: str = "fr_core_news_md"):
    import spacy
    return spacy.load(model_name, disable=["ner", "parser"])

def build_en_nlp(model_name: str = "en_core_web_sm"):
    import spacy
    return spacy.load(model_name, disable=["ner", "parser"])

def preprocess_fr(texts: Iterable[str], nlp=None) -> List[str]:
    if nlp is None:
        nlp = build_fr_nlp()
    out = []
    for doc in nlp.pipe([("" if x is None else str(x)) for x in texts], batch_size=256):
        toks = []
        for t in doc:
            if t.is_space or t.is_punct or t.like_num:
                continue
            if t.is_stop:
                continue
            lem = (t.lemma_ or t.text).lower().strip()
            if len(lem) < 2:
                continue
            toks.append(lem)
        out.append(" ".join(toks))
    return out

def preprocess_en(texts: Iterable[str], nlp=None) -> List[str]:
    if nlp is None:
        nlp = build_en_nlp()
    out = []
    for doc in nlp.pipe([("" if x is None else str(x)) for x in texts], batch_size=256):
        toks = []
        for t in doc:
            if t.is_space or t.is_punct or t.like_num:
                continue
            if t.is_stop:
                continue
            lem = (t.lemma_ or t.text).lower().strip()
            if len(lem) < 2:
                continue
            toks.append(lem)
        out.append(" ".join(toks))
    return out

@dataclass
class TranslatorFR2EN:
    model_name: str = "Helsinki-NLP/opus-mt-fr-en"
    # device: int = -1  # -1 CPU, 0 GPU
    device: int = 0  # -1 CPU, 0 GPU

    def __post_init__(self):
        self.pipe = pipeline("translation", model=self.model_name, device=self.device)

    def translate(self, texts: Iterable[str], batch_size: int = 16) -> List[str]:
        texts_list = [("" if x is None else str(x)) for x in texts]
        outputs = self.pipe(texts_list, batch_size=batch_size, truncation=True)
        return [o["translation_text"] for o in outputs]


def add_processed_columns(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    col_df1_produit: str = "produit",
    col_df2_best: str = "Produit √©l√©mentaire",
    translator: Optional[TranslatorFR2EN] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    On se concentre sur 'Produit √©l√©mentaire' pour df2 (champ le plus proche),
    et on garde aussi un champ df2 '__df2_join_en_proc' pour √©ventuellement enrichir.
    """
    if col_df1_produit not in df1.columns:
        raise ValueError(f"df1 n'a pas la colonne {col_df1_produit}")
    if col_df2_best not in df2.columns:
        raise ValueError(f"df2 n'a pas la colonne {col_df2_best}")

    fr_nlp = build_fr_nlp()
    en_nlp = build_en_nlp()
    if translator is None:
        translator = TranslatorFR2EN()

    df1 = df1.copy()
    df2 = df2.copy()

    # df1 produit
    df1["produit_fr_proc"] = preprocess_fr(df1[col_df1_produit].astype(str), nlp=fr_nlp)
    df1["produit_en"] = translator.translate(df1["produit_fr_proc"].tolist())
    df1["produit_en_proc"] = preprocess_en(df1["produit_en"], nlp=en_nlp)

    # df2 produit √©l√©mentaire (principal)
    df2["produit_elem_fr_proc"] = preprocess_fr(df2[col_df2_best].astype(str), nlp=fr_nlp)
    df2["produit_elem_en"] = translator.translate(df2["produit_elem_fr_proc"].tolist())
    df2["produit_elem_en_proc"] = preprocess_en(df2["produit_elem_en"], nlp=en_nlp)

    # champ joint optionnel (pond√©ration: Produit √©l√©mentaire x3)
    # utile si tu veux plus tard int√©grer d'autres colonnes, sans casser l'approche
    df2["__df2_join_en_proc"] = (
        (df2["produit_elem_en_proc"].fillna("") + " ") * 3
    ).str.replace(r"\s+", " ", regex=True).str.strip()

    return df1, df2


# ============================================================
# 2) Filtre lexical BM25 (avant embeddings)
# ============================================================

def bm25_candidates(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    df1_text_col: str = "produit_en_proc",
    df2_text_col: str = "produit_elem_en_proc",
    topk_bm25: int = 20,
) -> np.ndarray:
    """
    Retourne un tableau d'indices (n_df2, topk_bm25) : les meilleurs candidats df1
    pour chaque ligne df2 selon BM25.

    On tokenize simplement par split() car les textes sont d√©j√† normalis√©s.
    """
    

    corpus_tokens = [str(x).split() for x in df1[df1_text_col].fillna("").tolist()]
    bm25 = BM25Okapi(corpus_tokens)

    cand_idx = np.zeros((df2.shape[0], topk_bm25), dtype=int)

    for i, q in enumerate(df2[df2_text_col].fillna("").tolist()):
        q_tokens = str(q).split()
        scores = bm25.get_scores(q_tokens)  # (n_df1,)
        best = np.argsort(-scores)[:topk_bm25]
        cand_idx[i, :] = best

    return cand_idx


# ============================================================
# 3) Rerank embeddings sur candidats + proba Top-5
# ============================================================

def embed_texts(texts: List[str], model_name: str = "pritamdeka/S-PubMedBert-MS-MARCO") -> np.ndarray:
    model = SentenceTransformer(model_name)
    emb = model.encode(texts, normalize_embeddings=True, batch_size=64, show_progress_bar=True)
    return np.asarray(emb)

def softmax(x: np.ndarray, temperature: float = 0.07) -> np.ndarray:
    x = x / max(temperature, 1e-6)
    x = x - x.max(axis=1, keepdims=True)
    expx = np.exp(x)
    return expx / expx.sum(axis=1, keepdims=True)


# ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
# ‚ïë  DIFF√âRENCE ENTRE LES DEUX APPROCHES DE SIMILARIT√â                            ‚ïë
# ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

# 1) SIMILARIT√â COSINUS PURE (comment√©e ci-dessous) :
#    - Mesure uniquement l'orientation des vecteurs embeddings
#    - Ignore compl√®tement les mots exacts utilis√©s
#    - Exemple : "cath√©ter veineux" vs "tube sanguin" ‚Üí score √©lev√© (s√©mantique)
#    - Probl√®me : peut matcher des produits s√©mantiquement proches mais techniquement 
#      diff√©rents (ex: "seringue 5ml" vs "seringue 10ml")

# 2) SIMILARIT√â HYBRIDE COSINUS + TF-IDF (impl√©mentation actuelle) :
#    - Combine s√©mantique (embeddings) + lexical (TF-IDF)
#    - TF-IDF donne plus de poids aux termes rares/sp√©cifiques
#    - Exemple : "cath√©ter ventriculaire" ‚Üí "ventriculaire" p√®se plus lourd que 
#      "cath√©ter" (plus commun)
#    - Avantage : d√©tecte les correspondances exactes de termes techniques tout en 
#      gardant la compr√©hension s√©mantique
#    - Param√®tre alpha : contr√¥le l'√©quilibre entre les deux mesures
#      * alpha=1.0 ‚Üí 100% embeddings (cosinus pur)
#      * alpha=0.0 ‚Üí 100% TF-IDF (lexical pur)
#      * alpha=0.6 ‚Üí compromis pour nomenclatures m√©dicales

# INTERPR√âTABILIT√â :
# - Score final = (0.6 √ó similarit√©_s√©mantique) + (0.4 √ó importance_termes_communs)
# - Favorise les produits qui sont √† la fois :
#   1) S√©mantiquement proches (compris par le mod√®le)
#   2) Partageant des termes techniques sp√©cifiques



def match_with_bm25_then_embeddings(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    col_df1_key: str = "produit",
    df1_text_col: str = "produit_en_proc",
    df2_text_col: str = "produit_elem_en_proc",
    topk_bm25: int = 20,
    topk_final: int = 5,
    embedding_model: str = "pritamdeka/S-PubMedBert-MS-MARCO",
    temperature: float = 0.07,
    alpha: float = 0.6,  # Pond√©ration embeddings vs TF-IDF
) -> pd.DataFrame:
    """
    Pipeline:
    - BM25 filtre les candidats df1 (topk_bm25)
    - embeddings + TF-IDF rerank uniquement ces candidats
    - softmax sur similarit√©s => pseudo-proba
    - renvoie un tableau wide top-5 (et long via attrs)
    
    Param√®tres:
        alpha: Poids des embeddings (1-alpha = poids TF-IDF)
               alpha=1.0 ‚Üí similarit√© cosinus pure
               alpha=0.6 ‚Üí recommand√© (√©quilibre s√©mantique/lexical)
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from scipy.stats import spearmanr

    # df1 unique
    df1u = df1[[col_df1_key, df1_text_col]].drop_duplicates(subset=[col_df1_key]).reset_index(drop=True)

    # candidats BM25
    cand_idx = bm25_candidates(
        df1u,
        df2,
        df1_text_col=df1_text_col,
        df2_text_col=df2_text_col,
        topk_bm25=topk_bm25,
    )  # (n2, topk_bm25)

    # embeddings df1 (une seule fois)
    emb1 = embed_texts(df1u[df1_text_col].fillna("").tolist(), model_name=embedding_model)

    # embeddings df2 (sur champ principal)
    emb2 = embed_texts(df2[df2_text_col].fillna("").tolist(), model_name=embedding_model)

    # TF-IDF sur corpus df1 unique
    print("Calcul TF-IDF...")
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix_df1 = vectorizer.fit_transform(df1u[df1_text_col].fillna(""))
    
    # TF-IDF sur df2
    tfidf_matrix_df2 = vectorizer.transform(df2[df2_text_col].fillna(""))

    # calcul similarities restreint (HYBRIDE)
    n2 = df2.shape[0]
    sims = np.empty((n2, topk_bm25), dtype=float)
    
    # Stockage pour calcul d'incertitude
    embedding_sims_all = np.empty((n2, topk_bm25), dtype=float)
    tfidf_sims_all = np.empty((n2, topk_bm25), dtype=float)

    print("Calcul similarit√©s hybrides (embeddings + TF-IDF)...")
    for i in range(n2):
        idx = cand_idx[i]
        
        # # APPROCHE 1 (comment√©e) : Similarit√© cosinus pure sur embeddings
        # cos_sim = emb2[i] @ emb1[idx].T  # cosine car normalis√©
        # sims[i, :] = cos_sim
        
        # APPROCHE 2 (actuelle) : Hybride embeddings + TF-IDF
        # 1) Similarit√© s√©mantique (embeddings)
        embedding_sim = emb2[i] @ emb1[idx].T  # (topk_bm25,)
        
        # 2) Similarit√© lexicale (TF-IDF sur termes communs)
        tfidf_sim = (tfidf_matrix_df2[i] @ tfidf_matrix_df1[idx].T).toarray()[0]  # (topk_bm25,)
        
        # Stockage pour m√©triques d'incertitude
        embedding_sims_all[i, :] = embedding_sim
        tfidf_sims_all[i, :] = tfidf_sim
        
        # 3) Combinaison pond√©r√©e
        sims[i, :] = alpha * embedding_sim + (1 - alpha) * tfidf_sim

    probs = softmax(sims, temperature=temperature)  # (n2, topk_bm25)
    
    # ============================================================
    # QUANTIFICATION D'INCERTITUDE
    # ============================================================
    
    # 1) Entropie normalis√©e de la distribution de probabilit√©s
    # Interpr√©tation: >0.7 = tr√®s incertain (probs uniformes)
    entropy = -np.sum(probs * np.log(probs + 1e-10), axis=1)
    normalized_entropy = entropy / np.log(topk_bm25)
    
    # 2) Variance des similarit√©s brutes (avant softmax)
    # Interpr√©tation: variance √©lev√©e = scores bien diff√©renci√©s (bon signal)
    #                 variance faible = tous les candidats se ressemblent (mauvais signal)
    sim_variance = np.var(sims, axis=1)
    
    # 3) D√©saccord entre embeddings et TF-IDF (corr√©lation de Spearman)
    # Interpr√©tation: >0.5 = s√©mantique et lexique pointent vers candidats diff√©rents
    print("Calcul m√©triques d'incertitude...")
    disagreement = np.zeros(n2)
    for i in range(n2):
        corr, _ = spearmanr(embedding_sims_all[i], tfidf_sims_all[i])
        disagreement[i] = 1 - corr  # 0=accord parfait, 1=d√©saccord total
    
    # ============================================================

    # topk_final parmi candidats
    top_local = np.argsort(-probs, axis=1)[:, :topk_final]             # indices 0..topk_bm25-1
    top_prob = np.take_along_axis(probs, top_local, axis=1)            # (n2, topk_final)
    top_global_idx = np.take_along_axis(cand_idx, top_local, axis=1)   # indices dans df1u
    top_prod = df1u[col_df1_key].to_numpy()[top_global_idx]            # (n2, topk_final)

    # outputs
    rows = []
    for i in range(n2):
        for r in range(topk_final):
            rows.append({
                "Nomenclature achat": df2.iloc[i]["Nomenclature achat"],
                "rank": r + 1,
                "produit_match": top_prod[i, r],
                "proba": float(top_prob[i, r]),
            })
    out_long = pd.DataFrame(rows)

    wide = {
        "Nomenclature achat": df2["Nomenclature achat"].to_numpy(),
        "uncertainty_entropy": normalized_entropy,
        "similarity_variance": sim_variance,
        "embedding_tfidf_disagreement": disagreement,
    }
    
    for r in range(topk_final):
        wide[f"top{r+1}_produit"] = top_prod[:, r]
        wide[f"top{r+1}_proba"] = top_prob[:, r]
    out_wide = pd.DataFrame(wide)

    out_wide.attrs["out_long"] = out_long
    return out_wide

# ============================================================
# 4) Utilitaires pratiques
# ============================================================

def keep_df2_columns(df2: pd.DataFrame) -> pd.DataFrame:
    # m√™mes colonnes que tu veux conserver
    keep = [
        "Nomenclature achat",
        "Cat√©gories d'achat\n(N-2)",
        "Segments  d'achat\n(N-3)",
        "Sous-segment",
        "Produit √©l√©mentaire",
        "Code des Cat√©gories Homog√®nes \nde fournitures et prestations",
    ]
    missing = [c for c in keep if c not in df2.columns]
    if missing:
        raise ValueError(f"Colonnes manquantes: {missing}\nColonnes df2: {list(df2.columns)}")
    return df2[keep].copy()


# ============================================================
# 5) Exemple d'ex√©cution
# ============================================================

if __name__ == "__main__":
    # path_df1 = r"df_composant_medical_emissions_carbones.xlsx"
    # path_df2 = r"DISPOSITIFS_MED.xlsx"

    path_df1 = r"/home/onyxia/datachallenge2026/sujets/chu/Axe_2/results/df_composant_medical_emissions_carbones.xlsx"
    path_df2 = r"/home/onyxia/datachallenge2026/sujets/chu/Axe_2/DISPOSITIFS_MED.xlsx"

    df1 = pd.read_excel(path_df1)
    df2 = load_and_select_df2(path_df2)

    # print(df1.head())
    # print(df2.head())

    # TEMPS : 1min8s
    # NLP + traduction (1min9s)
    translator = TranslatorFR2EN(device=0)  # GPU
    df1p, df2p = add_processed_columns(
        df1, df2,
        col_df1_produit="produit",
        col_df2_best="Produit √©l√©mentaire",
        translator=translator
    )


    # TEMPS : 9.7s
    # Matching BM25 -> Embeddings+TF-IDF -> Top5 
    match_wide = match_with_bm25_then_embeddings(
        df1p, df2p,
        col_df1_key="produit",
        df1_text_col="produit_en_proc",
        df2_text_col="produit_elem_en_proc",
        topk_bm25=20,         
        topk_final=5,
        embedding_model="pritamdeka/S-PubMedBert-MS-MARCO",
        temperature=0.07,
        alpha=0.70  # 70% similarit√© cos, 30% TF-IDF
        # alpha=0  # 70% similarit√© cos, 30% TF-IDF
    )

    # # Sauvegarde
    # match_wide.to_excel("/home/onyxia/datachallenge2026/sujets/chu/Axe_2/results/MATCH_df2_vers_df1_top5_simcos_et_TF_IDF.xlsx", index=False)
    # match_wide.attrs["out_long"].to_excel("/home/onyxia/datachallenge2026/sujets/chu/Axe_2/results/MATCH_df2_vers_df1_top5_long_simcos_et_TF_IDF.xlsx", index=False)

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 25.62it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:04<00:00,  9.53it/s]


Calcul TF-IDF...
Calcul similarit√©s hybrides (embeddings + TF-IDF)...
Calcul m√©triques d'incertitude...


  corr, _ = spearmanr(embedding_sims_all[i], tfidf_sims_all[i])


### 3) R√©duction et √©valuation des co√ªts carbones 

Le site de [EcoLogits](https://ecologits.ai/latest/reference/tracers/utils/#tracers.utils.llm_impacts) peut √™tre utile

In [28]:
# SLM, TintBERT ?
# M√©triques et graphes de consommation carbonne par inf√©rence


# INT√âGRATION ECOLOGITS + CODECARBON
# ===================================

# Combine les deux outils pour une analyse compl√®te :
# - CodeCarbon : mesures g√©n√©rales (spaCy, TF-IDF, BM25)
# - EcoLogits : m√©triques d√©taill√©es pour LLMs (traduction, embeddings)

# Installation : pip install ecologits codecarbon


import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional
import time

# !pip install codecarbon
from codecarbon import EmissionsTracker

# !pip install ecologits
from ecologits.tracers.utils import llm_impacts
ECOLOGITS_AVAILABLE = True




In [32]:

@dataclass
class EnhancedModelMetrics:
    """M√©triques enrichies combinant CodeCarbon + EcoLogits"""
    model_name: str
    model_type: str  # "llm" ou "classic"
    
    # M√©triques communes
    duration_s: float
    energy_kwh: float
    
    # M√©triques CodeCarbon (tous mod√®les)
    co2_kg: float
    cpu_power_w: float = 0.0
    gpu_power_w: float = 0.0
    ram_power_w: float = 0.0
    
    # M√©triques EcoLogits (LLMs uniquement)
    gwp_kg_co2eq: Optional[float] = None      # Global Warming Potential
    adpe_kg_sb_eq: Optional[float] = None     # Abiotic Depletion (m√©taux)
    pe_mj: Optional[float] = None             # Primary Energy
    wcf_liters: Optional[float] = None        # Water Consumption
    usage_gwp: Optional[float] = None         # GWP usage seulement
    embodied_gwp: Optional[float] = None      # GWP embodied seulement
    
    # M√©tadonn√©es LLM
    tokens_processed: Optional[int] = None
    latency_per_token_ms: Optional[float] = None
    
    def to_dict(self) -> Dict:
        base = {
            "Mod√®le": self.model_name,
            "Type": self.model_type,
            "Dur√©e (s)": round(self.duration_s, 2),
            "√ânergie (kWh)": round(self.energy_kwh, 6),
            "CO2 CodeCarbon (kg)": round(self.co2_kg, 6),
        }
        
        # Ajout m√©triques EcoLogits si disponibles
        if self.model_type == "llm" and self.gwp_kg_co2eq is not None:
            base.update({
                "GWP total (kg CO2eq)": round(self.gwp_kg_co2eq, 6),
                "GWP usage (kg)": round(self.usage_gwp or 0, 6),
                "GWP embodied (kg)": round(self.embodied_gwp or 0, 6),
                "ADPe (kg Sb eq)": round(self.adpe_kg_sb_eq or 0, 9),
                "√ânergie primaire (MJ)": round(self.pe_mj or 0, 3),
                "Eau (litres)": round(self.wcf_liters or 0, 3),
                "Tokens trait√©s": self.tokens_processed or 0,
                "Latence/token (ms)": round(self.latency_per_token_ms or 0, 2),
            })
        
        return base


class HybridCarbonBenchmark:
    """
    Benchmark hybride utilisant CodeCarbon ET EcoLogits
    
    Usage:
        bench = HybridCarbonBenchmark()
        
        # Mod√®le classique (spaCy)
        with bench.track_classic("spacy_fr"):
            nlp = spacy.load("fr_core_news_md")
            docs = list(nlp.pipe(texts))
        
        # Mod√®le LLM (traduction)
        with bench.track_llm("translation", provider="huggingface", model="Helsinki-NLP/opus-mt-fr-en"):
            outputs = translator(texts)
    """
    
    def __init__(self, country_code: str = "FRA", project_name: str = "medical_matching"):
        self.country_code = country_code
        self.project_name = project_name
        self.results: List[EnhancedModelMetrics] = []
        
        if not ECOLOGITS_AVAILABLE:
            print("‚ö†Ô∏è EcoLogits non disponible - m√©triques limit√©es √† CodeCarbon")
    
    def track_classic(self, model_name: str):
        """Track un mod√®le classique (spaCy, sklearn, etc.) avec CodeCarbon"""
        return _ClassicModelTracker(self, model_name)
    
    def track_llm(
        self, 
        model_name: str,
        provider: str = "huggingface",
        model_id: str = None,
        electricity_mix_zone: str = None
    ):
        """Track un LLM avec CodeCarbon + EcoLogits"""
        return _LLMTracker(self, model_name, provider, model_id, electricity_mix_zone)
    
    def add_result(self, metrics: EnhancedModelMetrics):
        self.results.append(metrics)
    
    def get_dataframe(self) -> pd.DataFrame:
        if not self.results:
            return pd.DataFrame()
        return pd.DataFrame([r.to_dict() for r in self.results])
    
    def print_summary(self):
        if not self.results:
            print("Aucune mesure disponible")
            return
        
        df = self.get_dataframe()
        
        print("\n" + "="*100)
        print(f"üìä RAPPORT CARBONE HYBRIDE - Projet: {self.project_name}")
        print("="*100)
        
        # S√©paration LLM vs Classic
        llm_results = [r for r in self.results if r.model_type == "llm"]
        classic_results = [r for r in self.results if r.model_type == "classic"]
        
        print(f"\nüî¨ Mod√®les classiques: {len(classic_results)}")
        print(f"ü§ñ Mod√®les LLM: {len(llm_results)}")
        
        # Total CodeCarbon
        total_co2_cc = sum(r.co2_kg for r in self.results)
        total_energy = sum(r.energy_kwh for r in self.results)
        
        print(f"\n‚ö° √ânergie totale (CodeCarbon): {total_energy:.6f} kWh")
        print(f"üè≠ CO2 total (CodeCarbon): {total_co2_cc:.6f} kg")
        
        # Si EcoLogits disponible, afficher m√©triques enrichies
        if llm_results and llm_results[0].gwp_kg_co2eq is not None:
            total_gwp = sum(r.gwp_kg_co2eq or 0 for r in llm_results)
            total_adpe = sum(r.adpe_kg_sb_eq or 0 for r in llm_results)
            total_pe = sum(r.pe_mj or 0 for r in llm_results)
            total_water = sum(r.wcf_liters or 0 for r in llm_results)
            
            print("\n" + "-"*100)
            print("üåç M√âTRIQUES ENRICHIES ECOLOGITS (LLMs uniquement)")
            print("-"*100)
            print(f"‚Ä¢ GWP total: {total_gwp:.6f} kg CO2eq")
            print(f"  ‚îú‚îÄ Usage: {sum(r.usage_gwp or 0 for r in llm_results):.6f} kg")
            print(f"  ‚îî‚îÄ Embodied: {sum(r.embodied_gwp or 0 for r in llm_results):.6f} kg")
            print(f"‚Ä¢ ADPe (√©puisement m√©taux): {total_adpe:.9f} kg Sb eq")
            print(f"‚Ä¢ √ânergie primaire: {total_pe:.3f} MJ")
            print(f"‚Ä¢ Consommation d'eau: {total_water:.3f} litres")
            
            # √âquivalences eau
            print(f"\nüíß √âquivalent eau:")
            print(f"  ‚Ä¢ {total_water / 0.25:.0f} verres d'eau (250ml)")
            print(f"  ‚Ä¢ {total_water / 8:.1f} douches (8L/min pendant 1min)")
        
        print("\n" + "-"*100)
        print("D√©tail par mod√®le:")
        print("-"*100)
        print(df.to_string(index=False))
        
        # √âquivalences carbone
        print("\n" + "="*100)
        print("üå≥ √âQUIVALENCES CARBONE")
        print("="*100)
        km_voiture = total_co2_cc / 0.12
        arbres_an = total_co2_cc / 21
        smartphones = total_energy * 1000 / 0.012
        
        print(f"‚Ä¢ {km_voiture:.1f} km en voiture")
        print(f"‚Ä¢ {arbres_an:.2f} arbres pendant 1 an pour compenser")
        print(f"‚Ä¢ {smartphones:.0f} charges de smartphone")
    
    def save_results(self, filepath: str):
        df = self.get_dataframe()
        if df.empty:
            print("Aucun r√©sultat √† sauvegarder")
            return
        
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name="M√©triques compl√®tes", index=False)
            
            # Onglet comparaison LLM vs Classic
            comparison = pd.DataFrame({
                "Type": ["Mod√®les classiques", "Mod√®les LLM", "TOTAL"],
                "Nombre": [
                    len([r for r in self.results if r.model_type == "classic"]),
                    len([r for r in self.results if r.model_type == "llm"]),
                    len(self.results)
                ],
                "CO2 (kg)": [
                    sum(r.co2_kg for r in self.results if r.model_type == "classic"),
                    sum(r.co2_kg for r in self.results if r.model_type == "llm"),
                    sum(r.co2_kg for r in self.results)
                ],
                "√ânergie (kWh)": [
                    sum(r.energy_kwh for r in self.results if r.model_type == "classic"),
                    sum(r.energy_kwh for r in self.results if r.model_type == "llm"),
                    sum(r.energy_kwh for r in self.results)
                ]
            })
            comparison.to_excel(writer, sheet_name="LLM vs Classic", index=False)
        
        print(f"‚úÖ R√©sultats sauvegard√©s : {filepath}")


class _ClassicModelTracker:
    """Tracker pour mod√®les classiques (CodeCarbon uniquement)"""
    
    def __init__(self, benchmark: HybridCarbonBenchmark, model_name: str):
        self.benchmark = benchmark
        self.model_name = model_name
        self.tracker = None
        self.start_time = None
        
    def __enter__(self):
        self.start_time = time.time()
        self.tracker = EmissionsTracker(
            project_name=self.benchmark.project_name,
            country_2letter_iso_code=self.benchmark.country_code,
            log_level="WARNING",
        )
        self.tracker.start()
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        emissions = self.tracker.stop()
        duration = time.time() - self.start_time
        
        metrics = EnhancedModelMetrics(
            model_name=self.model_name,
            model_type="classic",
            duration_s=duration,
            energy_kwh=self.tracker._total_energy.kWh if hasattr(self.tracker, '_total_energy') else 0,
            co2_kg=emissions if emissions else 0,
            cpu_power_w=self.tracker._cpu_power.W if hasattr(self.tracker, '_cpu_power') else 0,
            gpu_power_w=self.tracker._gpu_power.W if hasattr(self.tracker, '_gpu_power') else 0,
            ram_power_w=self.tracker._ram_power.W if hasattr(self.tracker, '_ram_power') else 0,
        )
        
        self.benchmark.add_result(metrics)


class _LLMTracker:
    """Tracker pour LLMs (CodeCarbon + EcoLogits)"""
    
    def __init__(
        self, 
        benchmark: HybridCarbonBenchmark, 
        model_name: str,
        provider: str,
        model_id: str,
        electricity_mix_zone: str
    ):
        self.benchmark = benchmark
        self.model_name = model_name
        self.provider = provider
        self.model_id = model_id or model_name
        self.electricity_mix_zone = electricity_mix_zone or benchmark.country_code
        self.tracker = None
        self.start_time = None
        self.token_count = 0
        
    def __enter__(self):
        self.start_time = time.time()
        self.tracker = EmissionsTracker(
            project_name=self.benchmark.project_name,
            country_2letter_iso_code=self.benchmark.country_code,
            log_level="WARNING",
        )
        self.tracker.start()
        return self
    
    def set_token_count(self, count: int):
        """Permet de d√©finir le nombre de tokens trait√©s"""
        self.token_count = count
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        emissions_cc = self.tracker.stop()
        duration = time.time() - self.start_time
        
        # M√©triques CodeCarbon
        energy_kwh = self.tracker._total_energy.kWh if hasattr(self.tracker, '_total_energy') else 0
        
        # M√©triques EcoLogits (si disponible)
        gwp = adpe = pe = wcf = usage_gwp = embodied_gwp = None
        latency_per_token = None
        
        if ECOLOGITS_AVAILABLE and self.token_count > 0:
            try:
                impacts = llm_impacts(
                    provider=self.provider,
                    model_name=self.model_id,
                    output_token_count=self.token_count,
                    request_latency=duration,
                    electricity_mix_zone=self.electricity_mix_zone,
                )
                
                if impacts.gwp:
                    gwp = impacts.gwp.value
                    usage_gwp = impacts.usage.gwp.value if impacts.usage and impacts.usage.gwp else None
                    embodied_gwp = impacts.embodied.gwp.value if impacts.embodied and impacts.embodied.gwp else None
                
                adpe = impacts.adpe.value if impacts.adpe else None
                pe = impacts.pe.value if impacts.pe else None
                wcf = impacts.wcf.value if impacts.wcf else None
                latency_per_token = (duration / self.token_count) * 1000  # ms
                
            except Exception as e:
                print(f"‚ö†Ô∏è Erreur EcoLogits pour {self.model_name}: {e}")
        
        metrics = EnhancedModelMetrics(
            model_name=self.model_name,
            model_type="llm",
            duration_s=duration,
            energy_kwh=energy_kwh,
            co2_kg=emissions_cc if emissions_cc else 0,
            gwp_kg_co2eq=gwp,
            adpe_kg_sb_eq=adpe,
            pe_mj=pe,
            wcf_liters=wcf,
            usage_gwp=usage_gwp,
            embodied_gwp=embodied_gwp,
            tokens_processed=self.token_count if self.token_count > 0 else None,
            latency_per_token_ms=latency_per_token,
        )
        
        self.benchmark.add_result(metrics)


# ============================================================
# EXEMPLE D'UTILISATION AVEC TON PIPELINE
# ============================================================

if __name__ == "__main__":
    """
    Exemple d'int√©gration compl√®te dans ton pipeline de matching
    """
    
    bench = HybridCarbonBenchmark(
        country_code="FRA",
        project_name="medical_nomenclature_matching"
    )
    
    # 1) spaCy (classique)
    print("üìä Mesure spaCy fran√ßais...")
    with bench.track_classic("spaCy_fr_core_news_md"):
        import spacy
        nlp_fr = spacy.load("fr_core_news_md", disable=["ner", "parser"])
        # texts_fr = ["exemple"] * 1000
        # docs = list(nlp_fr.pipe(texts_fr, batch_size=256))
    
    # 2) spaCy anglais (classique)
    print("üìä Mesure spaCy anglais...")
    with bench.track_classic("spaCy_en_core_web_sm"):
        nlp_en = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        # docs = list(nlp_en.pipe(texts_en, batch_size=256))
    
    # 3) Traduction (LLM)
    print("üìä Mesure traduction...")
    tracker_translation = bench.track_llm(
        model_name="Helsinki_opus-mt-fr-en",
        provider="huggingface",
        model_id="Helsinki-NLP/opus-mt-fr-en",
        electricity_mix_zone="FRA"
    )
    
    with tracker_translation:
        from transformers import pipeline
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=0)
        # texts_fr = ["exemple"] * 1200
        # outputs = translator(texts_fr, batch_size=16, truncation=True)
        
        # Compter les tokens g√©n√©r√©s (approximation)
        # token_count = sum(len(o["translation_text"].split()) for o in outputs)
        token_count = 1200 * 15  # Approximation : 15 tokens/texte
        tracker_translation.set_token_count(token_count)
    
    # 4) Embeddings (LLM)
    print("üìä Mesure embeddings...")
    tracker_embeddings = bench.track_llm(
        model_name="S-PubMedBert-MS-MARCO",
        provider="huggingface",
        model_id="pritamdeka/S-PubMedBert-MS-MARCO",
        electricity_mix_zone="FRA"
    )
    
    with tracker_embeddings:
        from sentence_transformers import SentenceTransformer
        model_emb = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")
        # texts = ["exemple"] * 6200
        # embeddings = model_emb.encode(texts, normalize_embeddings=True, batch_size=64)
        
        # Approximation tokens (BERT = ~1.3 token/mot)
        token_count = 6200 * 10 * 1.3  # 6200 textes √ó 10 mots √ó 1.3
        tracker_embeddings.set_token_count(int(token_count))
    
    # 5) TF-IDF + BM25 (classique)
    print("üìä Mesure TF-IDF + BM25...")
    with bench.track_classic("TF-IDF_BM25_Reranking"):
        from sklearn.feature_extraction.text import TfidfVectorizer
        # vectorizer = TfidfVectorizer(max_features=5000)
        # tfidf_matrix = vectorizer.fit_transform(texts)
        # Ton code BM25 + reranking
        pass
    
    # R√©sultats
    bench.print_summary()
    bench.save_results("carbon_footprint_hybrid_report.xlsx")

üìä Mesure spaCy fran√ßais...


TypeError: BaseEmissionsTracker.__init__() got an unexpected keyword argument 'country_2letter_iso_code'

In [35]:
"""
INT√âGRATION ECOLOGITS + CODECARBON
===================================

Combine les deux outils pour une analyse compl√®te :
- CodeCarbon : mesures g√©n√©rales (spaCy, TF-IDF, BM25)
- EcoLogits : m√©triques d√©taill√©es pour LLMs (traduction, embeddings)

Installation : pip install ecologits codecarbon
"""

import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional
import time
from codecarbon import EmissionsTracker

try:
    from ecologits.tracers.utils import llm_impacts
    ECOLOGITS_AVAILABLE = True
except ImportError:
    ECOLOGITS_AVAILABLE = False
    print("‚ö†Ô∏è EcoLogits non disponible. Installez avec : pip install ecologits")


@dataclass
class EnhancedModelMetrics:
    """M√©triques enrichies combinant CodeCarbon + EcoLogits"""
    model_name: str
    model_type: str  # "llm" ou "classic"
    
    # M√©triques communes
    duration_s: float
    energy_kwh: float
    
    # M√©triques CodeCarbon (tous mod√®les)
    co2_kg: float
    cpu_power_w: float = 0.0
    gpu_power_w: float = 0.0
    ram_power_w: float = 0.0
    
    # M√©triques EcoLogits (LLMs uniquement)
    gwp_kg_co2eq: Optional[float] = None      # Global Warming Potential
    adpe_kg_sb_eq: Optional[float] = None     # Abiotic Depletion (m√©taux)
    pe_mj: Optional[float] = None             # Primary Energy
    wcf_liters: Optional[float] = None        # Water Consumption
    usage_gwp: Optional[float] = None         # GWP usage seulement
    embodied_gwp: Optional[float] = None      # GWP embodied seulement
    
    # M√©tadonn√©es LLM
    tokens_processed: Optional[int] = None
    latency_per_token_ms: Optional[float] = None
    
    def to_dict(self) -> Dict:
        base = {
            "Mod√®le": self.model_name,
            "Type": self.model_type,
            "Dur√©e (s)": round(self.duration_s, 2),
            "√ânergie (kWh)": round(self.energy_kwh, 6),
            "CO2 CodeCarbon (kg)": round(self.co2_kg, 6),
        }
        
        # Ajout m√©triques EcoLogits si disponibles
        if self.model_type == "llm" and self.gwp_kg_co2eq is not None:
            base.update({
                "GWP total (kg CO2eq)": round(self.gwp_kg_co2eq, 6),
                "GWP usage (kg)": round(self.usage_gwp or 0, 6),
                "GWP embodied (kg)": round(self.embodied_gwp or 0, 6),
                "ADPe (kg Sb eq)": round(self.adpe_kg_sb_eq or 0, 9),
                "√ânergie primaire (MJ)": round(self.pe_mj or 0, 3),
                "Eau (litres)": round(self.wcf_liters or 0, 3),
                "Tokens trait√©s": self.tokens_processed or 0,
                "Latence/token (ms)": round(self.latency_per_token_ms or 0, 2),
            })
        
        return base


class HybridCarbonBenchmark:
    """
    Benchmark hybride utilisant CodeCarbon ET EcoLogits
    
    Usage:
        bench = HybridCarbonBenchmark()
        
        # Mod√®le classique (spaCy)
        with bench.track_classic("spacy_fr"):
            nlp = spacy.load("fr_core_news_md")
            docs = list(nlp.pipe(texts))
        
        # Mod√®le LLM (traduction)
        with bench.track_llm("translation", provider="huggingface", model="Helsinki-NLP/opus-mt-fr-en"):
            outputs = translator(texts)
    """
    
    def __init__(self, country_code: str = "FRA", project_name: str = "medical_matching"):
        self.country_code = country_code
        self.project_name = project_name
        self.results: List[EnhancedModelMetrics] = []
        
        if not ECOLOGITS_AVAILABLE:
            print("‚ö†Ô∏è EcoLogits non disponible - m√©triques limit√©es √† CodeCarbon")
    
    def track_classic(self, model_name: str):
        """Track un mod√®le classique (spaCy, sklearn, etc.) avec CodeCarbon"""
        return _ClassicModelTracker(self, model_name)
    
    def track_llm(
        self, 
        model_name: str,
        provider: str = "huggingface",
        model_id: str = None,
        electricity_mix_zone: str = None
    ):
        """Track un LLM avec CodeCarbon + EcoLogits"""
        return _LLMTracker(self, model_name, provider, model_id, electricity_mix_zone)
    
    def add_result(self, metrics: EnhancedModelMetrics):
        self.results.append(metrics)
    
    def get_dataframe(self) -> pd.DataFrame:
        if not self.results:
            return pd.DataFrame()
        return pd.DataFrame([r.to_dict() for r in self.results])
    
    def print_summary(self):
        if not self.results:
            print("Aucune mesure disponible")
            return
        
        df = self.get_dataframe()
        
        print("\n" + "="*100)
        print(f"üìä RAPPORT CARBONE HYBRIDE - Projet: {self.project_name}")
        print("="*100)
        
        # S√©paration LLM vs Classic
        llm_results = [r for r in self.results if r.model_type == "llm"]
        classic_results = [r for r in self.results if r.model_type == "classic"]
        
        print(f"\nüî¨ Mod√®les classiques: {len(classic_results)}")
        print(f"ü§ñ Mod√®les LLM: {len(llm_results)}")
        
        # Total CodeCarbon
        total_co2_cc = sum(r.co2_kg for r in self.results)
        total_energy = sum(r.energy_kwh for r in self.results)
        
        print(f"\n‚ö° √ânergie totale (CodeCarbon): {total_energy:.6f} kWh")
        print(f"üè≠ CO2 total (CodeCarbon): {total_co2_cc:.6f} kg")
        
        # Si EcoLogits disponible, afficher m√©triques enrichies
        if llm_results and llm_results[0].gwp_kg_co2eq is not None:
            total_gwp = sum(r.gwp_kg_co2eq or 0 for r in llm_results)
            total_adpe = sum(r.adpe_kg_sb_eq or 0 for r in llm_results)
            total_pe = sum(r.pe_mj or 0 for r in llm_results)
            total_water = sum(r.wcf_liters or 0 for r in llm_results)
            
            print("\n" + "-"*100)
            print("üåç M√âTRIQUES ENRICHIES ECOLOGITS (LLMs uniquement)")
            print("-"*100)
            print(f"‚Ä¢ GWP total: {total_gwp:.6f} kg CO2eq")
            print(f"  ‚îú‚îÄ Usage: {sum(r.usage_gwp or 0 for r in llm_results):.6f} kg")
            print(f"  ‚îî‚îÄ Embodied: {sum(r.embodied_gwp or 0 for r in llm_results):.6f} kg")
            print(f"‚Ä¢ ADPe (√©puisement m√©taux): {total_adpe:.9f} kg Sb eq")
            print(f"‚Ä¢ √ânergie primaire: {total_pe:.3f} MJ")
            print(f"‚Ä¢ Consommation d'eau: {total_water:.3f} litres")
            
            # √âquivalences eau
            print(f"\nüíß √âquivalent eau:")
            print(f"  ‚Ä¢ {total_water / 0.25:.0f} verres d'eau (250ml)")
            print(f"  ‚Ä¢ {total_water / 8:.1f} douches (8L/min pendant 1min)")
        
        print("\n" + "-"*100)
        print("D√©tail par mod√®le:")
        print("-"*100)
        print(df.to_string(index=False))
        
        # √âquivalences carbone
        print("\n" + "="*100)
        print("üå≥ √âQUIVALENCES CARBONE")
        print("="*100)
        km_voiture = total_co2_cc / 0.12
        arbres_an = total_co2_cc / 21
        smartphones = total_energy * 1000 / 0.012
        
        print(f"‚Ä¢ {km_voiture:.1f} km en voiture")
        print(f"‚Ä¢ {arbres_an:.2f} arbres pendant 1 an pour compenser")
        print(f"‚Ä¢ {smartphones:.0f} charges de smartphone")
    
    def save_results(self, filepath: str):
        df = self.get_dataframe()
        if df.empty:
            print("Aucun r√©sultat √† sauvegarder")
            return
        
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name="M√©triques compl√®tes", index=False)
            
            # Onglet comparaison LLM vs Classic
            comparison = pd.DataFrame({
                "Type": ["Mod√®les classiques", "Mod√®les LLM", "TOTAL"],
                "Nombre": [
                    len([r for r in self.results if r.model_type == "classic"]),
                    len([r for r in self.results if r.model_type == "llm"]),
                    len(self.results)
                ],
                "CO2 (kg)": [
                    sum(r.co2_kg for r in self.results if r.model_type == "classic"),
                    sum(r.co2_kg for r in self.results if r.model_type == "llm"),
                    sum(r.co2_kg for r in self.results)
                ],
                "√ânergie (kWh)": [
                    sum(r.energy_kwh for r in self.results if r.model_type == "classic"),
                    sum(r.energy_kwh for r in self.results if r.model_type == "llm"),
                    sum(r.energy_kwh for r in self.results)
                ]
            })
            comparison.to_excel(writer, sheet_name="LLM vs Classic", index=False)
        
        print(f"‚úÖ R√©sultats sauvegard√©s : {filepath}")


class _ClassicModelTracker:
    """Tracker pour mod√®les classiques (CodeCarbon uniquement)"""
    
    def __init__(self, benchmark: HybridCarbonBenchmark, model_name: str):
        self.benchmark = benchmark
        self.model_name = model_name
        self.tracker = None
        self.start_time = None
        
    def __enter__(self):
        self.start_time = time.time()
        # CodeCarbon 3.x utilise country_2letter_iso_code (pas country_iso_code)
        country_code_2letter = self.benchmark.country_code[:2] if len(self.benchmark.country_code) > 2 else self.benchmark.country_code
        self.tracker = EmissionsTracker(
            project_name=self.benchmark.project_name,
            # country_2letter_iso_code=country_code_2letter,
            log_level="warning",
        )
        self.tracker.start()
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        emissions = self.tracker.stop()
        duration = time.time() - self.start_time
        
        metrics = EnhancedModelMetrics(
            model_name=self.model_name,
            model_type="classic",
            duration_s=duration,
            energy_kwh=self.tracker._total_energy.kWh if hasattr(self.tracker, '_total_energy') else 0,
            co2_kg=emissions if emissions else 0,
            cpu_power_w=self.tracker._cpu_power.W if hasattr(self.tracker, '_cpu_power') else 0,
            gpu_power_w=self.tracker._gpu_power.W if hasattr(self.tracker, '_gpu_power') else 0,
            ram_power_w=self.tracker._ram_power.W if hasattr(self.tracker, '_ram_power') else 0,
        )
        
        self.benchmark.add_result(metrics)


class _LLMTracker:
    """Tracker pour LLMs (CodeCarbon + EcoLogits)"""
    
    def __init__(
        self, 
        benchmark: HybridCarbonBenchmark, 
        model_name: str,
        provider: str,
        model_id: str,
        electricity_mix_zone: str
    ):
        self.benchmark = benchmark
        self.model_name = model_name
        self.provider = provider
        self.model_id = model_id or model_name
        self.electricity_mix_zone = electricity_mix_zone or benchmark.country_code
        self.tracker = None
        self.start_time = None
        self.token_count = 0
        
    def __enter__(self):
        self.start_time = time.time()
        self.tracker = EmissionsTracker(
            project_name=self.benchmark.project_name,
            # country_iso_code=self.benchmark.country_code,
            log_level="WARNING",
        )
        self.tracker.start()
        return self
    
    def set_token_count(self, count: int):
        """Permet de d√©finir le nombre de tokens trait√©s"""
        self.token_count = count
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        emissions_cc = self.tracker.stop()
        duration = time.time() - self.start_time
        
        # M√©triques CodeCarbon
        energy_kwh = self.tracker._total_energy.kWh if hasattr(self.tracker, '_total_energy') else 0
        
        # M√©triques EcoLogits (si disponible)
        gwp = adpe = pe = wcf = usage_gwp = embodied_gwp = None
        latency_per_token = None
        
        if ECOLOGITS_AVAILABLE and self.token_count > 0:
            try:
                impacts = llm_impacts(
                    provider=self.provider,
                    model_name=self.model_id,
                    output_token_count=self.token_count,
                    request_latency=duration,
                    electricity_mix_zone=self.electricity_mix_zone,
                )
                
                if impacts.gwp:
                    gwp = impacts.gwp.value
                    usage_gwp = impacts.usage.gwp.value if impacts.usage and impacts.usage.gwp else None
                    embodied_gwp = impacts.embodied.gwp.value if impacts.embodied and impacts.embodied.gwp else None
                
                adpe = impacts.adpe.value if impacts.adpe else None
                pe = impacts.pe.value if impacts.pe else None
                wcf = impacts.wcf.value if impacts.wcf else None
                latency_per_token = (duration / self.token_count) * 1000  # ms
                
            except Exception as e:
                print(f"‚ö†Ô∏è Erreur EcoLogits pour {self.model_name}: {e}")
        
        metrics = EnhancedModelMetrics(
            model_name=self.model_name,
            model_type="llm",
            duration_s=duration,
            energy_kwh=energy_kwh,
            co2_kg=emissions_cc if emissions_cc else 0,
            gwp_kg_co2eq=gwp,
            adpe_kg_sb_eq=adpe,
            pe_mj=pe,
            wcf_liters=wcf,
            usage_gwp=usage_gwp,
            embodied_gwp=embodied_gwp,
            tokens_processed=self.token_count if self.token_count > 0 else None,
            latency_per_token_ms=latency_per_token,
        )
        
        self.benchmark.add_result(metrics)


# ============================================================
# EXEMPLE D'UTILISATION AVEC TON PIPELINE
# ============================================================

if __name__ == "__main__":
    """
    Exemple d'int√©gration compl√®te dans ton pipeline de matching
    """
    
    bench = HybridCarbonBenchmark(
        country_code="FRA",
        project_name="medical_nomenclature_matching"
    )
    
    # 1) spaCy (classique)
    print("üìä Mesure spaCy fran√ßais...")
    with bench.track_classic("spaCy_fr_core_news_md"):
        import spacy
        nlp_fr = spacy.load("fr_core_news_md", disable=["ner", "parser"])
        # texts_fr = ["exemple"] * 1000
        # docs = list(nlp_fr.pipe(texts_fr, batch_size=256))
    
    # 2) spaCy anglais (classique)
    print("üìä Mesure spaCy anglais...")
    with bench.track_classic("spaCy_en_core_web_sm"):
        nlp_en = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        # docs = list(nlp_en.pipe(texts_en, batch_size=256))
    
    # 3) Traduction (LLM)
    print("üìä Mesure traduction...")
    tracker_translation = bench.track_llm(
        model_name="Helsinki_opus-mt-fr-en",
        provider="huggingface",
        model_id="Helsinki-NLP/opus-mt-fr-en",
        electricity_mix_zone="FRA"
    )
    
    with tracker_translation:
        from transformers import pipeline
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=0)
        # texts_fr = ["exemple"] * 1200
        # outputs = translator(texts_fr, batch_size=16, truncation=True)
        
        # Compter les tokens g√©n√©r√©s (approximation)
        # token_count = sum(len(o["translation_text"].split()) for o in outputs)
        token_count = 1200 * 15  # Approximation : 15 tokens/texte
        tracker_translation.set_token_count(token_count)
    
    # 4) Embeddings (LLM)
    print("üìä Mesure embeddings...")
    tracker_embeddings = bench.track_llm(
        model_name="S-PubMedBert-MS-MARCO",
        provider="huggingface",
        model_id="pritamdeka/S-PubMedBert-MS-MARCO",
        electricity_mix_zone="FRA"
    )
    
    with tracker_embeddings:
        from sentence_transformers import SentenceTransformer
        model_emb = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")
        # texts = ["exemple"] * 6200
        # embeddings = model_emb.encode(texts, normalize_embeddings=True, batch_size=64)
        
        # Approximation tokens (BERT = ~1.3 token/mot)
        token_count = 6200 * 10 * 1.3  # 6200 textes √ó 10 mots √ó 1.3
        tracker_embeddings.set_token_count(int(token_count))
    
    # 5) TF-IDF + BM25 (classique)
    print("üìä Mesure TF-IDF + BM25...")
    with bench.track_classic("TF-IDF_BM25_Reranking"):
        from sklearn.feature_extraction.text import TfidfVectorizer
        # vectorizer = TfidfVectorizer(max_features=5000)
        # tfidf_matrix = vectorizer.fit_transform(texts)
        # Ton code BM25 + reranking
        pass
    
    # R√©sultats
    bench.print_summary()
    bench.save_results("carbon_footprint_hybrid_report.xlsx")



 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU



üìä Mesure spaCy fran√ßais...


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU



üìä Mesure spaCy anglais...


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU



üìä Mesure traduction...


Device set to use cuda:0
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU



üìä Mesure embeddings...


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU



üìä Mesure TF-IDF + BM25...

üìä RAPPORT CARBONE HYBRIDE - Projet: medical_nomenclature_matching

üî¨ Mod√®les classiques: 3
ü§ñ Mod√®les LLM: 2

‚ö° √ânergie totale (CodeCarbon): 0.000275 kWh
üè≠ CO2 total (CodeCarbon): 0.000015 kg

----------------------------------------------------------------------------------------------------
D√©tail par mod√®le:
----------------------------------------------------------------------------------------------------
                Mod√®le    Type  Dur√©e (s)  √ânergie (kWh)  CO2 CodeCarbon (kg)
 spaCy_fr_core_news_md classic       5.72       0.000072             0.000004
  spaCy_en_core_web_sm classic       4.72       0.000035             0.000002
Helsinki_opus-mt-fr-en     llm       5.18       0.000052             0.000003
 S-PubMedBert-MS-MARCO     llm       6.37       0.000097             0.000005
 TF-IDF_BM25_Reranking classic       4.28       0.000019             0.000001

üå≥ √âQUIVALENCES CARBONE
‚Ä¢ 0.0 km en voiture
‚Ä¢ 0.00 arbres p

In [31]:
import codecarbon
print(codecarbon.__version__)


3.2.1
