In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m179.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m93.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from Bio import Entrez
import pandas as pd
import time

# ✅ Identifiant requis pour utiliser l'API de la NCBI
Entrez.email = "amizmizhabiba6@gmail.com"  # Remplace par ton vrai email

# ✅ Fonction pour interroger PubMed
def get_pubmed_abstracts(query, max_results=80):
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    ids = record["IdList"]

    articles = []
    for pubmed_id in ids:
        try:
            fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="abstract", retmode="text")
            abstract = fetch_handle.read()
            articles.append((pubmed_id, abstract))
            time.sleep(0.5)  # Pour éviter de surcharger l’API
        except Exception as e:
            print(f"Erreur pour l'ID {pubmed_id} : {e}")
    return articles

# ✅ Requêtes associées aux catégories médicales
queries = {
    "Disease": "human diseases",
    "Symptom": "disease symptoms",
    "Gene": "human gene",
    "Protein": "human protein"
}

# ✅ Extraction des données
data = []

for category, query in queries.items():
    print(f"🔎 Recherche en cours : {query}")
    articles = get_pubmed_abstracts(query, max_results=80)
    for pmid, abstract in articles:
        data.append({"PMID": pmid, "Category": category, "Abstract": abstract.strip()})

# ✅ Enregistrement dans un fichier CSV
df = pd.DataFrame(data)
df.to_csv("corpus_medical.csv", index=False)
print("✅ Corpus médical enregistré dans 'corpus_medical.csv'")


🔎 Recherche en cours : human diseases
🔎 Recherche en cours : disease symptoms
🔎 Recherche en cours : human gene
🔎 Recherche en cours : human protein
✅ Corpus médical enregistré dans 'corpus_medical.csv'


In [None]:
# ✅ Compter directement depuis le DataFrame df
count_by_category = df['Category'].value_counts()
print("📊 Nombre de corpus par catégorie :")
print(count_by_category)


📊 Nombre de corpus par catégorie :
Category
Disease    80
Symptom    80
Gene       80
Protein    80
Name: count, dtype: int64


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import re
import string

# Stopwords anglais de base + adverbes courants ajoutés manuellement
stop_words = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any",
    "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "couldn't", "did", "didn't", "do", "does",
    "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further",
    "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll",
    "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how",
    "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it",
    "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself",
    "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our",
    "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll",
    "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
    "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're",
    "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
    "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
    "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours",
    "yourself", "yourselves",

    # Adverbes courants ajoutés
    "furthermore", "additionally", "however", "moreover", "nevertheless",
    "thus", "therefore", "hence", "indeed", "still", "nonetheless",
    "eventually", "usually", "generally", "specifically", "particularly",
    "simply", "quickly", "slowly", "rapidly", "always", "sometimes",
    "often", "rarely", "seldom", "hardly", "nearly", "barely", "easily"
])

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

# Fichier des gènes
gene_file = "/content/drive/MyDrive/hafsa_nlp_projet/unique_gene_symbols.txt"
with open(gene_file, "r", encoding="utf-8") as f:
    gene_list = set(line.strip().upper() for line in f if line.strip())

def extract_genes(text, gene_set):
    tokens = re.findall(r'\b\w+\b', text.upper())
    return {token for token in tokens if token in gene_set and token.lower() not in stop_words}

# Fichier des protéines
protein_file = "/content/drive/MyDrive/hafsa_nlp_projet/noms_proteines.txt"
with open(protein_file, "r", encoding="utf-8") as f:
    protein_names = set(line.strip().lower() for line in f if line.strip())

def extract_proteins(text):
    valid = set()
    matches1 = re.findall(r'\bprotein\s+(\w+)', text, re.IGNORECASE)
    matches2 = re.findall(r'(\w+)\s+protein\b', text, re.IGNORECASE)
    for m in matches1 + matches2:
        if m.lower() not in stop_words:
            valid.add(m.lower())
    from_list = {p for p in protein_names if p in text.lower() and p not in stop_words}
    return valid.union(from_list)

# Fichier des maladies
keywords = ["disease", "cancer", "disorder", "syndrome", "infection", "illness", "condition",
            "injury", "tumor", "tumour", "carcinoma", "neoplasm", "pathology", "autoimmune",
            "inflammatory", "genetic", "viral", "bacterial", "chronic", "arthritis", "diabetes"]
disease_pattern = re.compile(r"\b(?:%s)\b" % "|".join(re.escape(k) for k in keywords), re.IGNORECASE)
def extract_diseases(text):
    return {m for m in disease_pattern.findall(text) if m.lower() not in stop_words}

# Fichier des symptômes
symptom_file = "/content/drive/MyDrive/hafsa_nlp_projet/symptoms_list_merged.txt"
with open(symptom_file, "r", encoding="utf-8") as f:
    symptoms = set(line.strip().lower() for line in f if line.strip())

def extract_symptoms(text):
    return {s for s in symptoms if s in text.lower() and s not in stop_words}


In [None]:
import pandas as pd

# Charger les abstracts
df = pd.read_csv("/content/corpus_medical.csv")

# Vérifie qu'il y a bien une colonne "abstract"
if "Abstract" not in df.columns:
    raise ValueError("Le fichier doit contenir une colonne 'abstract'.")

# Appliquer le nettoyage + extraction des entités à chaque ligne
df["cleaned"] = df["Abstract"].apply(clean_text)
df["genes"] = df["cleaned"].apply(lambda text: list(extract_genes(text, gene_list)))
df["proteins"] = df["cleaned"].apply(extract_proteins)
df["diseases"] = df["cleaned"].apply(extract_diseases)
df["symptoms"] = df["cleaned"].apply(extract_symptoms)

# Sauvegarder dans un nouveau fichier CSV
df.to_csv("/content/abstracts07_06_annotated_custom.csv", index=False)

print("✅ Annotation terminée. Résultat enregistré dans 'abstracts_annotated_custom.csv'")


✅ Annotation terminée. Résultat enregistré dans 'abstracts_annotated_custom.csv'


In [None]:
####
import pandas as pd
import ast
import re

# Charger les données
df = pd.read_csv("/content/abstracts07_06_annotated_custom.csv")

# Fonction de transformation en BIO
def annotate_bio(text, gene_list, protein_list, disease_list, symptom_list):
    words = text.split()
    labels = ["O"] * len(words)

    # Fusionner toutes les entités avec leur label
    entity_map = []
    for term in gene_list:
        entity_map.append((term.lower(), "GENE"))
    for term in protein_list:
        entity_map.append((term.lower(), "PROTEIN"))
    for term in disease_list:
        entity_map.append((term.lower(), "DISEASE"))
    for term in symptom_list:
        entity_map.append((term.lower(), "SYMPTOM"))

    # Annoter le texte mot à mot
    for entity, label in entity_map:
        entity_words = entity.split()
        entity_len = len(entity_words)

        for i in range(len(words) - entity_len + 1):
            window = words[i:i+entity_len]
            if [w.lower() for w in window] == entity_words:
                labels[i] = f"B-{label}"
                for j in range(1, entity_len):
                    labels[i+j] = f"I-{label}"

    return list(zip(words, labels))

# Fonction utilitaire pour transformer les chaînes en listes
def safe_list(val):
    if pd.isna(val) or val == '':
        return []
    if isinstance(val, list):
        return val
    try:
        return ast.literal_eval(val)
    except:
        return []

# Application à tout le DataFrame
bio_data = []

for idx, row in df.iterrows():
    text = row['Abstract']
    genes = safe_list(row.get("genes", []))
    proteins = safe_list(row.get("proteins", []))
    diseases = safe_list(row.get("diseases", []))
    symptoms = safe_list(row.get("symptoms", []))

    bio_tokens = annotate_bio(text, genes, proteins, diseases, symptoms)

    for word, label in bio_tokens:
        bio_data.append({
            "token": word,
            "label": label,
            "abstract_id": idx
        })

# Sauvegarde en CSV
bio_df = pd.DataFrame(bio_data)
bio_df.to_csv("/content/abstracts_bio_format.csv", index=False)

print("✅ Fichier BIO généré : /content/abstracts_bio_format.csv")


✅ Fichier BIO généré : /content/abstracts_bio_format.csv


In [None]:
import pandas as pd

# Charger le fichier BIO
bio_df = pd.read_csv("/content/abstracts_bio_format.csv")

# Compter les occurrences de chaque label
label_counts = bio_df['label'].value_counts()

# Afficher les résultats
print("📊 Nombre d'occurrences par label :")
print(label_counts)


📊 Nombre d'occurrences par label :
label
O            137750
B-DISEASE      1084
B-PROTEIN       853
B-GENE          349
I-PROTEIN        66
B-SYMPTOM        30
I-SYMPTOM         3
Name: count, dtype: int64


In [None]:
import pandas as pd

df = pd.read_csv("/content/abstracts07_06_annotated_custom.csv")
print(df.columns.tolist())


['PMID', 'Category', 'Abstract', 'cleaned', 'genes', 'proteins', 'diseases', 'symptoms']


In [None]:
import pandas as pd

# Charger les annotations
df = pd.read_csv("/content/abstracts07_06_annotated_custom.csv")

# Initialiser les compteurs
total_genes = 0
total_proteins = 0
total_diseases = 0
total_symptoms = 0

# Fonction pour compter les entités dans une cellule
def count_entities(cell):
    if pd.isna(cell) or cell.strip() == "":
        return 0
    return len(cell.split(','))

# Compter pour chaque ligne
for _, row in df.iterrows():
    total_genes += count_entities(row['genes'])
    total_proteins += count_entities(row['proteins'])
    total_diseases += count_entities(row['diseases'])
    total_symptoms += count_entities(row['symptoms'])

# Résultats
print("🧬 Nombre total de gènes :", total_genes)
print("🧪 Nombre total de protéines :", total_proteins)
print("🦠 Nombre total de maladies :", total_diseases)
print("🤒 Nombre total de symptômes :", total_symptoms)


🧬 Nombre total de gènes : 458
🧪 Nombre total de protéines : 815
🦠 Nombre total de maladies : 630
🤒 Nombre total de symptômes : 331


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Charger le fichier original
df = pd.read_csv("/content/abstracts07_06_annotated_custom.csv")

# 80% entraînement, 10% validation, 10% test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Sauvegarder les splits
train_df.to_csv("/content/abstracts07_06_train.csv", index=False)
val_df.to_csv("/content/abstracts07_06_val.csv", index=False)
test_df.to_csv("/content/abstracts07_06_test.csv", index=False)

print(f"✅ Split terminé :")
print(f"- Entraînement : {len(train_df)} lignes")
print(f"- Validation   : {len(val_df)} lignes")
print(f"- Test         : {len(test_df)} lignes")


✅ Split terminé :
- Entraînement : 256 lignes
- Validation   : 32 lignes
- Test         : 32 lignes


# Premiere Model

In [None]:
train_df = pd.read_csv("/content/abstracts07_06_train.csv")
val_df   = pd.read_csv("/content/abstracts07_06_val.csv")
test_df  = pd.read_csv("/content/abstracts07_06_test.csv")


In [None]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import ast

# Download 'punkt' for word_tokenize (already done, but good practice to keep)
nltk.download('punkt')

# Download the specific 'punkt_tab' resource mentioned in the error
nltk.download('punkt_tab')


# 📌 Fonction principale pour annotation BIO
def annotate_bio(df, output_path):
    with open(output_path, "w", encoding="utf-8") as fout:
        for _, row in df.iterrows():
            abstract = row['cleaned']  # ✅ colonne contenant le texte nettoyé

            # Assurez-vous que la colonne 'cleaned' n'est pas vide ou None
            if pd.isna(abstract) or abstract.strip() == "":
                 continue # Ignore les lignes sans texte nettoyé

            try:
                # Utiliser un ensemble vide si la colonne n'existe pas
                # ou si l'évaluation échoue
                genes = set(ast.literal_eval(row.get('genes', '[]')))
                proteins = set(ast.literal_eval(row.get('proteins', '[]')))
                diseases = set(ast.literal_eval(row.get('diseases', '[]')))
                symptoms = set(ast.literal_eval(row.get('symptoms', '[]')))
            except Exception as e:
                print(f"Erreur parsing à la ligne {row.name} : {e}")
                continue

            # Gérer les cas où abstract pourrait être None ou non-string
            if not isinstance(abstract, str):
                 print(f"Ligne {row.name} a un type d'abstract inattendu: {type(abstract)}")
                 continue


            tokens = word_tokenize(abstract)

            for token in tokens:
                tag = "O"
                token_lower = token.lower()
                token_upper = token.upper()

                # Itérer sur les ensembles d'entités
                if token_upper in genes:
                    tag = "B-GENE"
                elif token_lower in proteins:
                    tag = "B-PROTEIN"
                elif token_lower in diseases:
                    tag = "B-DISEASE"
                elif token_lower in symptoms:
                    tag = "B-SYMPTOM"

                fout.write(f"{token} {tag}\n")
            fout.write("\n")

    print(f"✅ Fichier BIO sauvegardé dans : {output_path}")

# 🔄 Chargement des fichiers
train_df = pd.read_csv("/content/abstracts07_06_train.csv")
val_df   = pd.read_csv("/content/abstracts07_06_val.csv")
test_df  = pd.read_csv("/content/abstracts07_06_test.csv")

# 📝 Application
annotate_bio(train_df, "/content/abstracts07_06_bio_train.bio")
annotate_bio(val_df, "/content/abstracts07_06_bio_val.bio")
annotate_bio(test_df, "/content/abstracts07_06_bio_test.bio")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_train.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_val.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_test.bio


In [None]:
#Elle ignore les tokens "O" dans les labels pendant l'entraînement.

from datasets import Dataset
from transformers import AutoTokenizer

# 📌 Modèle utilisé (BioBERT)
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🏷️ Étiquettes
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# 📄 Lire un fichier BIO et l'organiser par phrase
def read_bio_file(filepath):
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    current_tokens.append(token)
                    current_labels.append(tag)

        # Ajouter la dernière phrase si non vide
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels


# ✂️ Tokenisation + alignement des labels avec gestion du max_length
#    et exclusion des tokens "O" (ignorés avec -100)
def tokenize_and_align(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                tag = label[word_idx]
                label_ids.append(label2id[tag] if tag != "O" else -100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# 📁 Préparer un Dataset HuggingFace depuis un fichier BIO
def prepare_dataset_from_bio(filepath):
    sents, tags = read_bio_file(filepath)
    tokenized = tokenize_and_align(sents, tags, tokenizer)
    return Dataset.from_dict(tokenized)


# ✅ Créer les datasets avec les bons fichiers
train_dataset = prepare_dataset_from_bio("/content/abstracts07_06_bio_train.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test.bio")

# 🔍 Vérification
print("✅ Datasets créés :")
print("Exemple train_dataset :")
print(train_dataset[0])



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

✅ Datasets créés :
Exemple train_dataset :
{'input_ids': [101, 122, 24443, 22572, 4060, 2496, 1161, 17881, 1571, 14516, 1643, 122, 16222, 1571, 3236, 23117, 22997, 9465, 1275, 7393, 1545, 179, 170, 2599, 17881, 1571, 3236, 23117, 22997, 174, 16091, 1830, 17881, 1571, 1336, 1695, 15139, 17599, 2087, 7535, 2386, 1596, 11140, 3919, 4164, 6099, 1751, 11432, 2022, 3023, 3507, 17960, 20942, 181, 1182, 177, 122, 11437, 1162, 194, 123, 192, 1358, 194, 124, 11019, 1186, 194, 125, 181, 19009, 192, 126, 195, 22235, 187, 127, 175, 14429, 193, 128, 5871, 1186, 187, 129, 2351, 1869, 122, 1278, 22759, 5144, 1161, 2657, 2755, 1131, 15449, 2118, 6745, 11964, 1477, 5144, 1161, 2853, 12844, 4807, 21718, 2605, 16198, 8117, 1278, 1470, 2332, 2364, 2657, 2755, 1129, 23784, 2118, 6087, 1545, 1580, 5144, 1161, 2853, 3653, 13347, 1654, 1148, 2657, 2057, 5144, 6420, 185, 1742, 1704, 2704, 1129, 23784, 2118, 1620, 1604, 24239, 5144, 1161, 4828, 4134, 17801, 24606, 24766, 1545, 1580, 1580, 1559, 19207, 3254, 123,

In [None]:
!pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=58e0ae9d41eb3d39f695bf09fb1c0dc94e601e6c398dfc83775a9cb2bf3c9623
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# ✅ Charger le modèle pré-entraîné
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 📊 Métriques d’évaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# 🔧 Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_biobert",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
)

# 🚀 Entraînement avec Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhafsaikram31[0m ([33mhafsaikram31-cadi-ayyad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.56
100,0.1735


TrainOutput(global_step=128, training_loss=0.30564939603209496, metrics={'train_runtime': 117.4076, 'train_samples_per_second': 8.722, 'train_steps_per_second': 1.09, 'total_flos': 267575136092160.0, 'train_loss': 0.30564939603209496, 'epoch': 4.0})

In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 0.9504
F1-score : 0.9504

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.98      0.97      0.97        94
        GENE       0.88      0.98      0.93        45
     PROTEIN       0.96      0.93      0.95       102
     SYMPTOM       0.00      0.00      0.00         1

   micro avg       0.95      0.95      0.95       242
   macro avg       0.70      0.72      0.71       242
weighted avg       0.95      0.95      0.95       242



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.9648
F1-score : 0.9648

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.97      1.00      0.99       101
        GENE       0.91      0.94      0.92        52
     PROTEIN       1.00      0.95      0.97        73
     SYMPTOM       0.00      0.00      0.00         1

   micro avg       0.96      0.96      0.96       227
   macro avg       0.72      0.72      0.72       227
weighted avg       0.96      0.96      0.96       227



In [None]:
import shutil
import os

source_dir = "/content"
target_dir = "/content/drive/MyDrive/NLP_dernier_modification_09_06"

# Créer le dossier cible s’il n’existe pas
os.makedirs(target_dir, exist_ok=True)

# Parcourir tous les fichiers/dossiers dans /content sauf /content/drive
for item in os.listdir(source_dir):
    src_path = os.path.join(source_dir, item)
    dst_path = os.path.join(target_dir, item)

    if item == "drive":
        continue  # ⚠️ Ignorer le dossier Google Drive

    try:
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        else:
            shutil.copy2(src_path, dst_path)
    except Exception as e:
        print(f"Erreur en copiant {item} : {e}")

print(f"✅ Tous les fichiers (sauf /drive) ont été copiés vers : {target_dir}")


✅ Tous les fichiers (sauf /drive) ont été copiés vers : /content/drive/MyDrive/NLP_dernier_modification_09_06


## Test d'abilation sur data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Charger le fichier original
df = pd.read_csv("/content/abstracts07_06_annotated_custom.csv")

# Différentes configurations à tester
configurations = [
    # (train%, val%, test%, nom)
    (0.8, 0.1, 0.1, "80_10_10"),     # Original
    (0.7, 0.15, 0.15, "70_15_15"),   # Plus équilibré
    (0.9, 0.05, 0.05, "90_5_5"),     # Plus d'entraînement
    (0.6, 0.2, 0.2, "60_20_20"),     # Plus de validation/test
    (0.75, 0.125, 0.125, "75_12.5_12.5"), # Intermédiaire
]

print(f"📊 Dataset total : {len(df)} lignes\n")

for train_pct, val_pct, test_pct, name in configurations:
    print(f"🔄 Configuration {name} (Train: {train_pct:.1%}, Val: {val_pct:.1%}, Test: {test_pct:.1%})")

    # Premier split : train vs (val+test)
    train_df, temp_df = train_test_split(df, test_size=(val_pct + test_pct), random_state=42)

    # Deuxième split : val vs test
    val_df, test_df = train_test_split(temp_df, test_size=(test_pct/(val_pct + test_pct)), random_state=42)

    # Sauvegarder avec suffixe
    train_df.to_csv(f"/content/abstracts07_06_train_{name}.csv", index=False)
    val_df.to_csv(f"/content/abstracts07_06_val_{name}.csv", index=False)
    test_df.to_csv(f"/content/abstracts07_06_test_{name}.csv", index=False)

    print(f"  ✅ Entraînement : {len(train_df)} lignes ({len(train_df)/len(df):.1%})")
    print(f"  ✅ Validation   : {len(val_df)} lignes ({len(val_df)/len(df):.1%})")
    print(f"  ✅ Test         : {len(test_df)} lignes ({len(test_df)/len(df):.1%})")
    print(f"  📁 Fichiers sauvegardés avec suffixe '_{name}'\n")

print("✅ Tous les splits terminés !")
print("\nFichiers créés :")
for _, _, _, name in configurations:
    print(f"  - abstracts07_06_train_{name}.csv")
    print(f"  - abstracts07_06_val_{name}.csv")
    print(f"  - abstracts07_06_test_{name}.csv")

📊 Dataset total : 320 lignes

🔄 Configuration 80_10_10 (Train: 80.0%, Val: 10.0%, Test: 10.0%)
  ✅ Entraînement : 256 lignes (80.0%)
  ✅ Validation   : 32 lignes (10.0%)
  ✅ Test         : 32 lignes (10.0%)
  📁 Fichiers sauvegardés avec suffixe '_80_10_10'

🔄 Configuration 70_15_15 (Train: 70.0%, Val: 15.0%, Test: 15.0%)
  ✅ Entraînement : 224 lignes (70.0%)
  ✅ Validation   : 48 lignes (15.0%)
  ✅ Test         : 48 lignes (15.0%)
  📁 Fichiers sauvegardés avec suffixe '_70_15_15'

🔄 Configuration 90_5_5 (Train: 90.0%, Val: 5.0%, Test: 5.0%)
  ✅ Entraînement : 288 lignes (90.0%)
  ✅ Validation   : 16 lignes (5.0%)
  ✅ Test         : 16 lignes (5.0%)
  📁 Fichiers sauvegardés avec suffixe '_90_5_5'

🔄 Configuration 60_20_20 (Train: 60.0%, Val: 20.0%, Test: 20.0%)
  ✅ Entraînement : 192 lignes (60.0%)
  ✅ Validation   : 64 lignes (20.0%)
  ✅ Test         : 64 lignes (20.0%)
  📁 Fichiers sauvegardés avec suffixe '_60_20_20'

🔄 Configuration 75_12.5_12.5 (Train: 75.0%, Val: 12.5%, Test: 12.5

### _90_5_5

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import ast

# Download 'punkt' for word_tokenize (already done, but good practice to keep)
nltk.download('punkt')

# Download the specific 'punkt_tab' resource mentioned in the error
nltk.download('punkt_tab')


# 📌 Fonction principale pour annotation BIO
def annotate_bio(df, output_path):
    with open(output_path, "w", encoding="utf-8") as fout:
        for _, row in df.iterrows():
            abstract = row['cleaned']  # ✅ colonne contenant le texte nettoyé

            # Assurez-vous que la colonne 'cleaned' n'est pas vide ou None
            if pd.isna(abstract) or abstract.strip() == "":
                 continue # Ignore les lignes sans texte nettoyé

            try:
                # Utiliser un ensemble vide si la colonne n'existe pas
                # ou si l'évaluation échoue
                genes = set(ast.literal_eval(row.get('genes', '[]')))
                proteins = set(ast.literal_eval(row.get('proteins', '[]')))
                diseases = set(ast.literal_eval(row.get('diseases', '[]')))
                symptoms = set(ast.literal_eval(row.get('symptoms', '[]')))
            except Exception as e:
                print(f"Erreur parsing à la ligne {row.name} : {e}")
                continue

            # Gérer les cas où abstract pourrait être None ou non-string
            if not isinstance(abstract, str):
                 print(f"Ligne {row.name} a un type d'abstract inattendu: {type(abstract)}")
                 continue


            tokens = word_tokenize(abstract)

            for token in tokens:
                tag = "O"
                token_lower = token.lower()
                token_upper = token.upper()

                # Itérer sur les ensembles d'entités
                if token_upper in genes:
                    tag = "B-GENE"
                elif token_lower in proteins:
                    tag = "B-PROTEIN"
                elif token_lower in diseases:
                    tag = "B-DISEASE"
                elif token_lower in symptoms:
                    tag = "B-SYMPTOM"

                fout.write(f"{token} {tag}\n")
            fout.write("\n")

    print(f"✅ Fichier BIO sauvegardé dans : {output_path}")

# 🔄 Chargement des fichiers
train_df = pd.read_csv("/content/abstracts07_06_train_90_5_5.csv")
val_df   = pd.read_csv("/content/abstracts07_06_val_90_5_5.csv")
test_df  = pd.read_csv("/content/abstracts07_06_test_90_5_5.csv")

# 📝 Application
annotate_bio(train_df, "/content/abstracts07_06_bio_train_90_5_5.bio")
annotate_bio(val_df, "/content/abstracts07_06_bio_val_90_5_5.bio")
annotate_bio(test_df, "/content/abstracts07_06_bio_test_90_5_5.bio")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_train_90_5_5.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_val_90_5_5.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_test_90_5_5.bio


In [None]:
#Elle ignore les tokens "O" dans les labels pendant l'entraînement.

from datasets import Dataset
from transformers import AutoTokenizer

# 📌 Modèle utilisé (BioBERT)
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🏷️ Étiquettes
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# 📄 Lire un fichier BIO et l'organiser par phrase
def read_bio_file(filepath):
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    current_tokens.append(token)
                    current_labels.append(tag)

        # Ajouter la dernière phrase si non vide
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels


# ✂️ Tokenisation + alignement des labels avec gestion du max_length
#    et exclusion des tokens "O" (ignorés avec -100)
def tokenize_and_align(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                tag = label[word_idx]
                label_ids.append(label2id[tag] if tag != "O" else -100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# 📁 Préparer un Dataset HuggingFace depuis un fichier BIO
def prepare_dataset_from_bio(filepath):
    sents, tags = read_bio_file(filepath)
    tokenized = tokenize_and_align(sents, tags, tokenizer)
    return Dataset.from_dict(tokenized)


# ✅ Créer les datasets avec les bons fichiers
train_dataset = prepare_dataset_from_bio("//content/abstracts07_06_bio_train_90_5_5.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val_90_5_5.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test_90_5_5.bio")

# 🔍 Vérification
print("✅ Datasets créés :")
print("Exemple train_dataset :")
print(train_dataset[0])



✅ Datasets créés :
Exemple train_dataset :
{'input_ids': [101, 122, 25128, 4386, 1306, 185, 7111, 1918, 2528, 1233, 17881, 1571, 179, 3488, 127, 12737, 1568, 21336, 9465, 1275, 7393, 1545, 179, 171, 1665, 1643, 17881, 1571, 12737, 1568, 21336, 3294, 3075, 5911, 2393, 1377, 1830, 1324, 1559, 2394, 3052, 26410, 13499, 3946, 2629, 16516, 6602, 15796, 3850, 172, 1548, 1643, 16236, 1394, 10645, 12104, 16042, 3773, 23220, 4993, 177, 122, 177, 1358, 193, 123, 195, 10436, 1403, 181, 124, 16358, 2118, 194, 125, 2351, 1869, 122, 2853, 1137, 1582, 15680, 4724, 188, 17204, 10390, 8281, 1234, 188, 2704, 11371, 1403, 3454, 2755, 1278, 5182, 188, 17204, 10390, 1539, 1559, 1477, 185, 1197, 5144, 1161, 1278, 1297, 2332, 8614, 175, 20257, 9513, 2755, 2598, 2815, 175, 14875, 14640, 8301, 10424, 1580, 185, 1197, 5144, 1161, 123, 1131, 19411, 10436, 2057, 13306, 3653, 1654, 13347, 1131, 19411, 10436, 8918, 4167, 21943, 4807, 1131, 19411, 10436, 4062, 18910, 10973, 185, 1197, 5144, 1161, 124, 2853, 1137, 15

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# ✅ Charger le modèle pré-entraîné
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 📊 Métriques d’évaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# 🔧 Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_biobert",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
)

# 🚀 Entraînement avec Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5585
100,0.1244


TrainOutput(global_step=144, training_loss=0.2566818479034636, metrics={'train_runtime': 119.0834, 'train_samples_per_second': 9.674, 'train_steps_per_second': 1.209, 'total_flos': 301022028103680.0, 'train_loss': 0.2566818479034636, 'epoch': 4.0})

In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 1.0000
F1-score : 1.0000

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       1.00      1.00      1.00        49
        GENE       1.00      1.00      1.00        38
     PROTEIN       1.00      1.00      1.00         5
     SYMPTOM       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00        93
   macro avg       1.00      1.00      1.00        93
weighted avg       1.00      1.00      1.00        93



In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.9417
F1-score : 0.9417

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.97      1.00      0.98        61
        GENE       0.96      0.83      0.89        29
     PROTEIN       0.80      0.92      0.86        13

   micro avg       0.94      0.94      0.94       103
   macro avg       0.91      0.92      0.91       103
weighted avg       0.94      0.94      0.94       103



In [None]:
import shutil
import os

source_dir = "/content"
target_dir = "/content/drive/MyDrive/NLP_dernier_modification_09_06_90_5_5"

# Créer le dossier cible s’il n’existe pas
os.makedirs(target_dir, exist_ok=True)

# Parcourir tous les fichiers/dossiers dans /content sauf /content/drive
for item in os.listdir(source_dir):
    src_path = os.path.join(source_dir, item)
    dst_path = os.path.join(target_dir, item)

    if item == "drive":
        continue  # ⚠️ Ignorer le dossier Google Drive

    try:
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        else:
            shutil.copy2(src_path, dst_path)
    except Exception as e:
        print(f"Erreur en copiant {item} : {e}")

print(f"✅ Tous les fichiers (sauf /drive) ont été copiés vers : {target_dir}")


✅ Tous les fichiers (sauf /drive) ont été copiés vers : /content/drive/MyDrive/NLP_dernier_modification_09_06_90_5_5


------------------------------------------------------------------------

### _70_15_15

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import ast

# Download 'punkt' for word_tokenize (already done, but good practice to keep)
nltk.download('punkt')

# Download the specific 'punkt_tab' resource mentioned in the error
nltk.download('punkt_tab')


# 📌 Fonction principale pour annotation BIO
def annotate_bio(df, output_path):
    with open(output_path, "w", encoding="utf-8") as fout:
        for _, row in df.iterrows():
            abstract = row['cleaned']  # ✅ colonne contenant le texte nettoyé

            # Assurez-vous que la colonne 'cleaned' n'est pas vide ou None
            if pd.isna(abstract) or abstract.strip() == "":
                 continue # Ignore les lignes sans texte nettoyé

            try:
                # Utiliser un ensemble vide si la colonne n'existe pas
                # ou si l'évaluation échoue
                genes = set(ast.literal_eval(row.get('genes', '[]')))
                proteins = set(ast.literal_eval(row.get('proteins', '[]')))
                diseases = set(ast.literal_eval(row.get('diseases', '[]')))
                symptoms = set(ast.literal_eval(row.get('symptoms', '[]')))
            except Exception as e:
                print(f"Erreur parsing à la ligne {row.name} : {e}")
                continue

            # Gérer les cas où abstract pourrait être None ou non-string
            if not isinstance(abstract, str):
                 print(f"Ligne {row.name} a un type d'abstract inattendu: {type(abstract)}")
                 continue


            tokens = word_tokenize(abstract)

            for token in tokens:
                tag = "O"
                token_lower = token.lower()
                token_upper = token.upper()

                # Itérer sur les ensembles d'entités
                if token_upper in genes:
                    tag = "B-GENE"
                elif token_lower in proteins:
                    tag = "B-PROTEIN"
                elif token_lower in diseases:
                    tag = "B-DISEASE"
                elif token_lower in symptoms:
                    tag = "B-SYMPTOM"

                fout.write(f"{token} {tag}\n")
            fout.write("\n")

    print(f"✅ Fichier BIO sauvegardé dans : {output_path}")

# 🔄 Chargement des fichiers
train_df = pd.read_csv("/content/abstracts07_06_train_70_15_15.csv")
val_df   = pd.read_csv("/content/abstracts07_06_val_70_15_15.csv")
test_df  = pd.read_csv("/content/abstracts07_06_test_70_15_15.csv")

# 📝 Application
annotate_bio(train_df, "/content/abstracts07_06_bio_train_70_15_15.bio")
annotate_bio(val_df, "/content/abstracts07_06_bio_val_70_15_15.bio")
annotate_bio(test_df, "/content/abstracts07_06_bio_test_70_15_15.bio")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_train_70_15_15.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_val_70_15_15.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_test_70_15_15.bio


In [None]:
#Elle ignore les tokens "O" dans les labels pendant l'entraînement.

from datasets import Dataset
from transformers import AutoTokenizer

# 📌 Modèle utilisé (BioBERT)
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🏷️ Étiquettes
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# 📄 Lire un fichier BIO et l'organiser par phrase
def read_bio_file(filepath):
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    current_tokens.append(token)
                    current_labels.append(tag)

        # Ajouter la dernière phrase si non vide
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels


# ✂️ Tokenisation + alignement des labels avec gestion du max_length
#    et exclusion des tokens "O" (ignorés avec -100)
def tokenize_and_align(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                tag = label[word_idx]
                label_ids.append(label2id[tag] if tag != "O" else -100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# 📁 Préparer un Dataset HuggingFace depuis un fichier BIO
def prepare_dataset_from_bio(filepath):
    sents, tags = read_bio_file(filepath)
    tokenized = tokenize_and_align(sents, tags, tokenizer)
    return Dataset.from_dict(tokenized)


# ✅ Créer les datasets avec les bons fichiers
train_dataset = prepare_dataset_from_bio("//content/abstracts07_06_bio_train_70_15_15.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val_70_15_15.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test_70_15_15.bio")

# 🔍 Vérification
print("✅ Datasets créés :")
print("Exemple train_dataset :")
print(train_dataset[0])



✅ Datasets créés :
Exemple train_dataset :
{'input_ids': [101, 122, 195, 15564, 12491, 194, 1182, 193, 4175, 195, 1161, 195, 3031, 17881, 1571, 179, 3488, 1275, 8359, 1659, 12445, 11004, 9465, 1275, 3413, 16480, 3975, 1161, 179, 172, 1179, 14541, 18202, 26303, 17881, 11049, 23117, 1475, 3135, 1580, 1604, 1477, 3772, 6600, 2272, 18250, 3246, 1558, 16798, 3621, 25575, 1958, 1664, 1353, 2765, 13093, 4182, 4420, 1606, 1407, 175, 175, 1181, 1403, 11109, 172, 1204, 3342, 5144, 6420, 11108, 1907, 5144, 6420, 6654, 181, 1182, 192, 122, 181, 1182, 179, 1324, 123, 195, 6583, 179, 1665, 122, 20049, 2118, 193, 1403, 122, 192, 1358, 179, 122, 195, 17204, 181, 3361, 123, 11078, 2118, 176, 2087, 122, 2351, 1869, 122, 2853, 4272, 5182, 1704, 2704, 2638, 5184, 2663, 23220, 1179, 1979, 2704, 2657, 1278, 9468, 21440, 2118, 2755, 9468, 21440, 2118, 13075, 7629, 1477, 5144, 1161, 123, 2853, 2070, 6360, 1704, 2704, 2638, 5184, 2663, 23220, 1179, 1979, 2704, 2657, 1278, 9468, 21440, 2118, 2755, 9468, 21440, 

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# ✅ Charger le modèle pré-entraîné
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 📊 Métriques d’évaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# 🔧 Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_biobert",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
)

# 🚀 Entraînement avec Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5942
100,0.1383


TrainOutput(global_step=112, training_loss=0.3349013738334179, metrics={'train_runtime': 136.8084, 'train_samples_per_second': 6.549, 'train_steps_per_second': 0.819, 'total_flos': 234128244080640.0, 'train_loss': 0.3349013738334179, 'epoch': 4.0})

In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 0.9549
F1-score : 0.9549

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.99      0.98      0.98       162
        GENE       0.89      0.97      0.93        64
     PROTEIN       0.95      0.95      0.95       148
     SYMPTOM       0.00      0.00      0.00         3

   micro avg       0.95      0.95      0.95       377
   macro avg       0.71      0.72      0.71       377
weighted avg       0.95      0.95      0.95       377



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.9563
F1-score : 0.9563

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.98      1.00      0.99       144
        GENE       0.90      0.94      0.92        78
     PROTEIN       0.97      0.91      0.94        97
     SYMPTOM       1.00      1.00      1.00         1

   micro avg       0.96      0.96      0.96       320
   macro avg       0.96      0.96      0.96       320
weighted avg       0.96      0.96      0.96       320



In [None]:
import shutil
import os

source_dir = "/content"
target_dir = "/content/drive/MyDrive/NLP_dernier_modification_09_06_70_15_15"

# Créer le dossier cible s’il n’existe pas
os.makedirs(target_dir, exist_ok=True)

# Parcourir tous les fichiers/dossiers dans /content sauf /content/drive
for item in os.listdir(source_dir):
    src_path = os.path.join(source_dir, item)
    dst_path = os.path.join(target_dir, item)

    if item == "drive":
        continue  # ⚠️ Ignorer le dossier Google Drive

    try:
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        else:
            shutil.copy2(src_path, dst_path)
    except Exception as e:
        print(f"Erreur en copiant {item} : {e}")

print(f"✅ Tous les fichiers (sauf /drive) ont été copiés vers : {target_dir}")


✅ Tous les fichiers (sauf /drive) ont été copiés vers : /content/drive/MyDrive/NLP_dernier_modification_09_06_70_15_15


_______________________________________________________________

### _60_20_20

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import ast

# Download 'punkt' for word_tokenize (already done, but good practice to keep)
nltk.download('punkt')

# Download the specific 'punkt_tab' resource mentioned in the error
nltk.download('punkt_tab')


# 📌 Fonction principale pour annotation BIO
def annotate_bio(df, output_path):
    with open(output_path, "w", encoding="utf-8") as fout:
        for _, row in df.iterrows():
            abstract = row['cleaned']  # ✅ colonne contenant le texte nettoyé

            # Assurez-vous que la colonne 'cleaned' n'est pas vide ou None
            if pd.isna(abstract) or abstract.strip() == "":
                 continue # Ignore les lignes sans texte nettoyé

            try:
                # Utiliser un ensemble vide si la colonne n'existe pas
                # ou si l'évaluation échoue
                genes = set(ast.literal_eval(row.get('genes', '[]')))
                proteins = set(ast.literal_eval(row.get('proteins', '[]')))
                diseases = set(ast.literal_eval(row.get('diseases', '[]')))
                symptoms = set(ast.literal_eval(row.get('symptoms', '[]')))
            except Exception as e:
                print(f"Erreur parsing à la ligne {row.name} : {e}")
                continue

            # Gérer les cas où abstract pourrait être None ou non-string
            if not isinstance(abstract, str):
                 print(f"Ligne {row.name} a un type d'abstract inattendu: {type(abstract)}")
                 continue


            tokens = word_tokenize(abstract)

            for token in tokens:
                tag = "O"
                token_lower = token.lower()
                token_upper = token.upper()

                # Itérer sur les ensembles d'entités
                if token_upper in genes:
                    tag = "B-GENE"
                elif token_lower in proteins:
                    tag = "B-PROTEIN"
                elif token_lower in diseases:
                    tag = "B-DISEASE"
                elif token_lower in symptoms:
                    tag = "B-SYMPTOM"

                fout.write(f"{token} {tag}\n")
            fout.write("\n")

    print(f"✅ Fichier BIO sauvegardé dans : {output_path}")

# 🔄 Chargement des fichiers
train_df = pd.read_csv("/content/abstracts07_06_train_60_20_20.csv")
val_df   = pd.read_csv("/content/abstracts07_06_val_60_20_20.csv")
test_df  = pd.read_csv("/content/abstracts07_06_test_60_20_20.csv")

# 📝 Application
annotate_bio(train_df, "/content/abstracts07_06_bio_train_60_20_20.bio")
annotate_bio(val_df, "/content/abstracts07_06_bio_val_60_20_20.bio")
annotate_bio(test_df, "/content/abstracts07_06_bio_test_60_20_20.bio")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_train_60_20_20.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_val_60_20_20.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_test_60_20_20.bio


In [None]:
#Elle ignore les tokens "O" dans les labels pendant l'entraînement.

from datasets import Dataset
from transformers import AutoTokenizer

# 📌 Modèle utilisé (BioBERT)
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🏷️ Étiquettes
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# 📄 Lire un fichier BIO et l'organiser par phrase
def read_bio_file(filepath):
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    current_tokens.append(token)
                    current_labels.append(tag)

        # Ajouter la dernière phrase si non vide
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels


# ✂️ Tokenisation + alignement des labels avec gestion du max_length
#    et exclusion des tokens "O" (ignorés avec -100)
def tokenize_and_align(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                tag = label[word_idx]
                label_ids.append(label2id[tag] if tag != "O" else -100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# 📁 Préparer un Dataset HuggingFace depuis un fichier BIO
def prepare_dataset_from_bio(filepath):
    sents, tags = read_bio_file(filepath)
    tokenized = tokenize_and_align(sents, tags, tokenizer)
    return Dataset.from_dict(tokenized)


# ✅ Créer les datasets avec les bons fichiers
train_dataset = prepare_dataset_from_bio("/content/abstracts07_06_bio_train_60_20_20.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val_60_20_20.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test_60_20_20.bio")

# 🔍 Vérification
print("✅ Datasets créés :")
print("Exemple train_dataset :")
print(train_dataset[0])



✅ Datasets créés :
Exemple train_dataset :
{'input_ids': [101, 122, 188, 6617, 1231, 1643, 17881, 1571, 179, 3488, 127, 1405, 122, 1816, 1571, 9465, 1275, 9550, 1604, 188, 25892, 1571, 1580, 1604, 5507, 1571, 5347, 1580, 1571, 1545, 194, 8362, 17800, 1158, 3234, 9468, 2728, 1403, 1494, 1200, 9077, 1346, 10322, 9712, 1830, 26503, 23510, 1606, 12630, 4035, 25444, 2443, 1107, 16792, 185, 12937, 1161, 3262, 171, 1320, 13252, 2050, 175, 122, 176, 5709, 11071, 1320, 184, 123, 22245, 7291, 190, 124, 11580, 3740, 172, 125, 126, 22572, 10961, 16631, 172, 127, 1260, 5579, 1200, 194, 126, 1301, 19411, 1162, 173, 125, 3840, 27006, 176, 125, 2351, 1869, 122, 2587, 10093, 22572, 3484, 12809, 17288, 8362, 25105, 3150, 181, 13292, 1874, 1260, 9304, 26731, 12132, 23449, 1830, 8359, 1568, 9304, 13356, 5999, 1129, 1233, 5389, 1818, 175, 1643, 12937, 2225, 23449, 1830, 123, 8362, 11083, 181, 15136, 172, 1179, 1733, 4035, 1116, 1260, 181, 15136, 22233, 1200, 1306, 15276, 1197, 1571, 22737, 1580, 5682, 1306

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# ✅ Charger le modèle pré-entraîné
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 📊 Métriques d’évaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# 🔧 Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_biobert",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
)

# 🚀 Entraînement avec Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5503


TrainOutput(global_step=96, training_loss=0.3442710340023041, metrics={'train_runtime': 82.6827, 'train_samples_per_second': 9.289, 'train_steps_per_second': 1.161, 'total_flos': 200681352069120.0, 'train_loss': 0.3442710340023041, 'epoch': 4.0})

In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 0.9110
F1-score : 0.9110

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.94      0.98      0.96       244
        GENE       0.80      0.97      0.88       113
     PROTEIN       0.95      0.82      0.88       210
     SYMPTOM       0.00      0.00      0.00         6

   micro avg       0.91      0.91      0.91       573
   macro avg       0.67      0.69      0.68       573
weighted avg       0.91      0.91      0.91       573



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.9710
F1-score : 0.9710

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.97      1.00      0.98       200
        GENE       0.97      0.93      0.95        75
     PROTEIN       0.98      0.94      0.96       104

   micro avg       0.97      0.97      0.97       379
   macro avg       0.97      0.96      0.97       379
weighted avg       0.97      0.97      0.97       379



In [None]:
import shutil
import os

source_dir = "/content"
target_dir = "/content/drive/MyDrive/NLP_dernier_modification_09_06_60_20_20"

# Créer le dossier cible s’il n’existe pas
os.makedirs(target_dir, exist_ok=True)

# Parcourir tous les fichiers/dossiers dans /content sauf /content/drive
for item in os.listdir(source_dir):
    src_path = os.path.join(source_dir, item)
    dst_path = os.path.join(target_dir, item)

    if item == "drive":
        continue  # ⚠️ Ignorer le dossier Google Drive

    try:
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        else:
            shutil.copy2(src_path, dst_path)
    except Exception as e:
        print(f"Erreur en copiant {item} : {e}")

print(f"✅ Tous les fichiers (sauf /drive) ont été copiés vers : {target_dir}")


✅ Tous les fichiers (sauf /drive) ont été copiés vers : /content/drive/MyDrive/NLP_dernier_modification_09_06_60_20_20


_____________________________________________________________________

### _75_12.5_12.5

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import ast

# Download 'punkt' for word_tokenize (already done, but good practice to keep)
nltk.download('punkt')

# Download the specific 'punkt_tab' resource mentioned in the error
nltk.download('punkt_tab')


# 📌 Fonction principale pour annotation BIO
def annotate_bio(df, output_path):
    with open(output_path, "w", encoding="utf-8") as fout:
        for _, row in df.iterrows():
            abstract = row['cleaned']  # ✅ colonne contenant le texte nettoyé

            # Assurez-vous que la colonne 'cleaned' n'est pas vide ou None
            if pd.isna(abstract) or abstract.strip() == "":
                 continue # Ignore les lignes sans texte nettoyé

            try:
                # Utiliser un ensemble vide si la colonne n'existe pas
                # ou si l'évaluation échoue
                genes = set(ast.literal_eval(row.get('genes', '[]')))
                proteins = set(ast.literal_eval(row.get('proteins', '[]')))
                diseases = set(ast.literal_eval(row.get('diseases', '[]')))
                symptoms = set(ast.literal_eval(row.get('symptoms', '[]')))
            except Exception as e:
                print(f"Erreur parsing à la ligne {row.name} : {e}")
                continue

            # Gérer les cas où abstract pourrait être None ou non-string
            if not isinstance(abstract, str):
                 print(f"Ligne {row.name} a un type d'abstract inattendu: {type(abstract)}")
                 continue


            tokens = word_tokenize(abstract)

            for token in tokens:
                tag = "O"
                token_lower = token.lower()
                token_upper = token.upper()

                # Itérer sur les ensembles d'entités
                if token_upper in genes:
                    tag = "B-GENE"
                elif token_lower in proteins:
                    tag = "B-PROTEIN"
                elif token_lower in diseases:
                    tag = "B-DISEASE"
                elif token_lower in symptoms:
                    tag = "B-SYMPTOM"

                fout.write(f"{token} {tag}\n")
            fout.write("\n")

    print(f"✅ Fichier BIO sauvegardé dans : {output_path}")

# 🔄 Chargement des fichiers
train_df = pd.read_csv("/content/abstracts07_06_train_75_12.5_12.5.csv")
val_df   = pd.read_csv("/content/abstracts07_06_val_75_12.5_12.5.csv")
test_df  = pd.read_csv("/content/abstracts07_06_test_75_12.5_12.5.csv")

# 📝 Application
annotate_bio(train_df, "/content/abstracts07_06_bio_train_75_12.5_12.5.bio")
annotate_bio(val_df, "/content/abstracts07_06_bio_val_75_12.5_12.5.bio")
annotate_bio(test_df, "/content/abstracts07_06_bio_test_75_12.5_12.5.bio")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_train_75_12.5_12.5.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_val_75_12.5_12.5.bio
✅ Fichier BIO sauvegardé dans : /content/abstracts07_06_bio_test_75_12.5_12.5.bio


In [None]:
#Elle ignore les tokens "O" dans les labels pendant l'entraînement.

from datasets import Dataset
from transformers import AutoTokenizer

# 📌 Modèle utilisé (BioBERT)
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🏷️ Étiquettes
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# 📄 Lire un fichier BIO et l'organiser par phrase
def read_bio_file(filepath):
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    current_tokens.append(token)
                    current_labels.append(tag)

        # Ajouter la dernière phrase si non vide
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels


# ✂️ Tokenisation + alignement des labels avec gestion du max_length
#    et exclusion des tokens "O" (ignorés avec -100)
def tokenize_and_align(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                tag = label[word_idx]
                label_ids.append(label2id[tag] if tag != "O" else -100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# 📁 Préparer un Dataset HuggingFace depuis un fichier BIO
def prepare_dataset_from_bio(filepath):
    sents, tags = read_bio_file(filepath)
    tokenized = tokenize_and_align(sents, tags, tokenizer)
    return Dataset.from_dict(tokenized)


# ✅ Créer les datasets avec les bons fichiers
train_dataset = prepare_dataset_from_bio("//content/abstracts07_06_bio_train_75_12.5_12.5.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val_75_12.5_12.5.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test_75_12.5_12.5.bio")

# 🔍 Vérification
print("✅ Datasets créés :")
print("Exemple train_dataset :")
print(train_dataset[0])



✅ Datasets créés :
Exemple train_dataset :
{'input_ids': [101, 122, 1619, 1920, 4182, 17881, 1571, 179, 3488, 130, 3081, 128, 3731, 1604, 9465, 1275, 1620, 1559, 188, 7629, 1571, 10973, 5507, 1571, 4925, 1545, 14541, 127, 1679, 2660, 3365, 5838, 6730, 2112, 13035, 14471, 4420, 15819, 1113, 2528, 13791, 1596, 6059, 12818, 3189, 27154, 3622, 178, 15197, 13292, 1766, 172, 122, 1195, 22654, 12210, 1200, 183, 1197, 123, 1126, 2571, 3309, 188, 1116, 123, 2351, 1869, 122, 2417, 190, 13166, 8032, 1596, 1113, 12241, 2853, 190, 13166, 15741, 2755, 16946, 1465, 22572, 1813, 7841, 3052, 2138, 191, 1161, 1366, 1161, 172, 1182, 1571, 1181, 190, 2497, 13836, 6066, 1324, 8916, 123, 2853, 15190, 18766, 4807, 2755, 16946, 1465, 22572, 1813, 7841, 3052, 2138, 191, 1161, 1366, 1161, 3007, 17459, 3209, 3772, 1679, 2660, 3365, 5838, 6730, 8115, 2112, 13035, 14471, 1113, 2528, 13791, 1596, 4420, 15819, 13467, 1231, 25461, 4069, 12818, 3189, 7091, 2200, 4013, 7356, 23972, 1158, 1679, 2660, 3365, 5838, 6730, 1

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# ✅ Charger le modèle pré-entraîné
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 📊 Métriques d’évaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# 🔧 Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_biobert",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
)

# 🚀 Entraînement avec Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5564
100,0.122


TrainOutput(global_step=120, training_loss=0.2962733010450999, metrics={'train_runtime': 100.2352, 'train_samples_per_second': 9.577, 'train_steps_per_second': 1.197, 'total_flos': 250851690086400.0, 'train_loss': 0.2962733010450999, 'epoch': 4.0})

In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 0.9414
F1-score : 0.9414

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       1.00      0.97      0.99       146
        GENE       0.82      0.97      0.89        61
     PROTEIN       0.95      0.90      0.92       116
     SYMPTOM       0.00      0.00      0.00         1

   micro avg       0.94      0.94      0.94       324
   macro avg       0.69      0.71      0.70       324
weighted avg       0.94      0.94      0.94       324



In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.9706
F1-score : 0.9706

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.95      1.00      0.98       105
        GENE       0.98      0.92      0.95        59
     PROTEIN       0.98      0.97      0.98       107
     SYMPTOM       1.00      1.00      1.00         1

   micro avg       0.97      0.97      0.97       272
   macro avg       0.98      0.97      0.98       272
weighted avg       0.97      0.97      0.97       272



In [None]:
import shutil
import os

source_dir = "/content"
target_dir = "/content/drive/MyDrive/NLP_dernier_modification_09_06_75_12.5_12.5"

# Créer le dossier cible s’il n’existe pas
os.makedirs(target_dir, exist_ok=True)

# Parcourir tous les fichiers/dossiers dans /content sauf /content/drive
for item in os.listdir(source_dir):
    src_path = os.path.join(source_dir, item)
    dst_path = os.path.join(target_dir, item)

    if item == "drive":
        continue  # ⚠️ Ignorer le dossier Google Drive

    try:
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        else:
            shutil.copy2(src_path, dst_path)
    except Exception as e:
        print(f"Erreur en copiant {item} : {e}")

print(f"✅ Tous les fichiers (sauf /drive) ont été copiés vers : {target_dir}")


✅ Tous les fichiers (sauf /drive) ont été copiés vers : /content/drive/MyDrive/NLP_dernier_modification_09_06_75_12.5_12.5


________________________________________________________________________

## Test d'abilation Confeguration de Model

In [None]:
# Tests d'Ablation pour NER BioBERT
# Voici plusieurs variantes du modèle pour analyser l'impact de différents composants

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score
import torch
import torch.nn as nn

# Configuration de base
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 TEST D'ABLATION 1: Modèle avec couches gelées (frozen layers)
# ═══════════════════════════════════════════════════════════════════════════════

def create_frozen_model(num_frozen_layers=6):
    """Gèle les N premières couches de BioBERT"""
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    # Geler les premières couches
    for i, layer in enumerate(model.bert.encoder.layer):
        if i < num_frozen_layers:
            for param in layer.parameters():
                param.requires_grad = False

    print(f"✅ Modèle créé avec {num_frozen_layers} couches gelées")
    return model

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 TEST D'ABLATION 2: Modèle avec dropout modifié
# ═══════════════════════════════════════════════════════════════════════════════

def create_high_dropout_model(dropout_rate=0.3):
    """Augmente le dropout pour tester la régularisation"""
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        hidden_dropout_prob=dropout_rate,
        attention_probs_dropout_prob=dropout_rate
    )
    print(f"✅ Modèle créé avec dropout = {dropout_rate}")
    return model

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 TEST D'ABLATION 3: Modèle avec tête de classification simplifiée
# ═══════════════════════════════════════════════════════════════════════════════

class SimplifiedNERModel(nn.Module):
    def __init__(self, base_model_name, num_labels):
        super().__init__()
        self.bert = AutoModelForTokenClassification.from_pretrained(base_model_name).bert
        self.dropout = nn.Dropout(0.1)
        # Tête simplifiée : une seule couche linéaire au lieu de dropout + linear
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits}

def create_simplified_model():
    """Crée un modèle avec une tête de classification simplifiée"""
    model = SimplifiedNERModel(model_name, len(label_list))
    print("✅ Modèle créé avec tête de classification simplifiée")
    return model

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 TEST D'ABLATION 4: Modèle sans pré-entraînement (poids aléatoires)
# ═══════════════════════════════════════════════════════════════════════════════

def create_random_weights_model():
    """Initialise le modèle avec des poids aléatoires (pas de pré-entraînement)"""
    from transformers import BertConfig

    config = BertConfig.from_pretrained(model_name)
    config.num_labels = len(label_list)

    model = AutoModelForTokenClassification.from_config(config)
    model.config.id2label = id2label
    model.config.label2id = label2id

    print("✅ Modèle créé avec poids aléatoires (sans pré-entraînement)")
    return model

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 TEST D'ABLATION 5: Modèle avec moins de couches d'attention
# ═══════════════════════════════════════════════════════════════════════════════

def create_fewer_layers_model(num_layers=6):
    """Crée un modèle avec moins de couches transformer"""
    from transformers import BertConfig

    config = BertConfig.from_pretrained(model_name)
    config.num_hidden_layers = num_layers  # Réduire de 12 à 6 couches
    config.num_labels = len(label_list)

    model = AutoModelForTokenClassification.from_config(config)
    model.config.id2label = id2label
    model.config.label2id = label2id

    print(f"✅ Modèle créé avec {num_layers} couches au lieu de 12")
    return model

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 FONCTION D'ENTRAÎNEMENT GÉNÉRALISÉE
# ═══════════════════════════════════════════════════════════════════════════════

def train_ablation_model(model, train_dataset, val_dataset, test_dataset,
                        experiment_name, epochs=4, lr=2e-5):
    """Entraîne et évalue un modèle d'ablation"""

    # Métriques d'évaluation
    def compute_metrics(p):
        preds = np.argmax(p.predictions, axis=2)
        labels = p.label_ids

        true_predictions = [
            [id2label[p] for (p, l) in zip(pred, label) if l != -100]
            for pred, label in zip(preds, labels)
        ]
        true_labels = [
            [id2label[l] for (p, l) in zip(pred, label) if l != -100]
            for pred, label in zip(preds, labels)
        ]

        return {
            "accuracy": accuracy_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
            "report": classification_report(true_labels, true_predictions, output_dict=False)
        }

    # Arguments d'entraînement
    training_args = TrainingArguments(
        output_dir=f"./ner_biobert_{experiment_name}",
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_dir=f"./logs_{experiment_name}",
        logging_steps=50,
        save_steps=500,
        eval_steps=500,
        save_total_limit=2,
    )

    # Entraînement
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print(f"\n🚀 Début de l'entraînement : {experiment_name}")
    trainer.train()

    # Évaluation
    val_results = trainer.evaluate()
    test_results = trainer.evaluate(eval_dataset=test_dataset)

    print(f"\n📊 Résultats {experiment_name}:")
    print(f"Validation - Accuracy: {val_results['eval_accuracy']:.4f}, F1: {val_results['eval_f1']:.4f}")
    print(f"Test - Accuracy: {test_results['eval_accuracy']:.4f}, F1: {test_results['eval_f1']:.4f}")

    return {
        'experiment': experiment_name,
        'val_accuracy': val_results['eval_accuracy'],
        'val_f1': val_results['eval_f1'],
        'test_accuracy': test_results['eval_accuracy'],
        'test_f1': test_results['eval_f1']
    }

# ═══════════════════════════════════════════════════════════════════════════════
# 🧪 EXEMPLE D'UTILISATION - TESTS D'ABLATION
# ═══════════════════════════════════════════════════════════════════════════════

def run_ablation_study(train_dataset, val_dataset, test_dataset):
    """Lance tous les tests d'ablation"""

    results = []

    # Test 1: Modèle de base (référence)
    print("="*60)
    print("🧪 TEST 1: Modèle de base (BioBERT complet)")
    print("="*60)
    base_model = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
    )
    results.append(train_ablation_model(
        base_model, train_dataset, val_dataset, test_dataset, "baseline"
    ))

    # Test 2: Couches gelées
    print("="*60)
    print("🧪 TEST 2: Modèle avec 6 couches gelées")
    print("="*60)
    frozen_model = create_frozen_model(num_frozen_layers=6)
    results.append(train_ablation_model(
        frozen_model, train_dataset, val_dataset, test_dataset, "frozen_6_layers"
    ))

    # Test 3: Dropout élevé
    print("="*60)
    print("🧪 TEST 3: Modèle avec dropout élevé (0.3)")
    print("="*60)
    high_dropout_model = create_high_dropout_model(dropout_rate=0.3)
    results.append(train_ablation_model(
        high_dropout_model, train_dataset, val_dataset, test_dataset, "high_dropout"
    ))

    # Test 4: Tête simplifiée
    print("="*60)
    print("🧪 TEST 4: Modèle avec tête de classification simplifiée")
    print("="*60)
    simplified_model = create_simplified_model()
    results.append(train_ablation_model(
        simplified_model, train_dataset, val_dataset, test_dataset, "simplified_head"
    ))

    # Test 5: Poids aléatoires
    print("="*60)
    print("🧪 TEST 5: Modèle sans pré-entraînement (poids aléatoires)")
    print("="*60)
    random_model = create_random_weights_model()
    results.append(train_ablation_model(
        random_model, train_dataset, val_dataset, test_dataset, "random_weights", epochs=5, lr=5e-5
    ))

    # Résumé des résultats
    print("\n" + "="*80)
    print("📊 RÉSUMÉ DES TESTS D'ABLATION")
    print("="*80)
    print(f"{'Expérience':<20} {'Val Acc':<10} {'Val F1':<10} {'Test Acc':<10} {'Test F1':<10}")
    print("-" * 80)

    for result in results:
        print(f"{result['experiment']:<20} {result['val_accuracy']:<10.4f} {result['val_f1']:<10.4f} "
              f"{result['test_accuracy']:<10.4f} {result['test_f1']:<10.4f}")

    return results


In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# 🚀 LANCEMENT DES TESTS
# ═══════════════════════════════════════════════════════════════════════════════

# Supposons que vous avez déjà vos datasets chargés
# train_dataset = prepare_dataset_from_bio("path/to/train.bio")
# val_dataset = prepare_dataset_from_bio("path/to/val.bio")
# test_dataset = prepare_dataset_from_bio("path/to/test.bio")

train_dataset = prepare_dataset_from_bio("/content/abstracts07_06_bio_train_60_20_20.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val_60_20_20.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test_60_20_20.bio")

# Lancer l'étude d'ablation complète
ablation_results = run_ablation_study(train_dataset, val_dataset, test_dataset)

# OU lancer un test individuel, par exemple :
# model = create_frozen_model(num_frozen_layers=8)
# result = train_ablation_model(model, train_dataset, val_dataset, test_dataset, "frozen_8_layers")

🧪 TEST 1: Modèle de base (BioBERT complet)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



🚀 Début de l'entraînement : baseline


Step,Training Loss
50,0.5511


  _warn_prf(average, modifier, msg_start, len(result))



📊 Résultats baseline:
Validation - Accuracy: 0.9092, F1: 0.9092
Test - Accuracy: 0.9789, F1: 0.9789
🧪 TEST 2: Modèle avec 6 couches gelées


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


✅ Modèle créé avec 6 couches gelées

🚀 Début de l'entraînement : frozen_6_layers


Step,Training Loss
50,0.7001


  _warn_prf(average, modifier, msg_start, len(result))



📊 Résultats frozen_6_layers:
Validation - Accuracy: 0.8866, F1: 0.8866
Test - Accuracy: 0.9604, F1: 0.9604
🧪 TEST 3: Modèle avec dropout élevé (0.3)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


✅ Modèle créé avec dropout = 0.3

🚀 Début de l'entraînement : high_dropout


Step,Training Loss
50,0.8327


  _warn_prf(average, modifier, msg_start, len(result))



📊 Résultats high_dropout:
Validation - Accuracy: 0.8761, F1: 0.8761
Test - Accuracy: 0.9578, F1: 0.9578
🧪 TEST 4: Modèle avec tête de classification simplifiée


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


✅ Modèle créé avec tête de classification simplifiée

🚀 Début de l'entraînement : simplified_head


Step,Training Loss
50,0.6085


  _warn_prf(average, modifier, msg_start, len(result))



📊 Résultats simplified_head:
Validation - Accuracy: 0.8743, F1: 0.8743
Test - Accuracy: 0.9551, F1: 0.9551
🧪 TEST 5: Modèle sans pré-entraînement (poids aléatoires)
✅ Modèle créé avec poids aléatoires (sans pré-entraînement)

🚀 Début de l'entraînement : random_weights


  trainer = Trainer(


Step,Training Loss
50,0.7635
100,0.1875



📊 Résultats random_weights:
Validation - Accuracy: 0.8080, F1: 0.8080
Test - Accuracy: 0.9129, F1: 0.9129

📊 RÉSUMÉ DES TESTS D'ABLATION
Expérience           Val Acc    Val F1     Test Acc   Test F1   
--------------------------------------------------------------------------------
baseline             0.9092     0.9092     0.9789     0.9789    
frozen_6_layers      0.8866     0.8866     0.9604     0.9604    
high_dropout         0.8761     0.8761     0.9578     0.9578    
simplified_head      0.8743     0.8743     0.9551     0.9551    
random_weights       0.8080     0.8080     0.9129     0.9129    


# 2eme modele

In [None]:
!pip install -U transformers




In [None]:
!pip install -U transformers datasets


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
label_list = ['O', 'B-GENE', 'B-PROTEIN', 'B-DISEASE', 'B-SYMPTOM']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# 📄 Lire un fichier BIO et l'organiser par phrase
def read_bio_file(filepath):
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    current_tokens.append(token)
                    current_labels.append(tag)

        # Ajouter la dernière phrase si non vide
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels


# ✂ Tokenisation + alignement des labels avec gestion du max_length
#    et exclusion des tokens "O" (ignorés avec -100)
def tokenize_and_align(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                tag = label[word_idx]
                label_ids.append(label2id[tag] if tag != "O" else -100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# 📁 Préparer un Dataset HuggingFace depuis un fichier BIO
def prepare_dataset_from_bio(filepath):
    sents, tags = read_bio_file(filepath)
    tokenized = tokenize_and_align(sents, tags, tokenizer)
    return Dataset.from_dict(tokenized)

In [None]:
#train_dataset = prepare_dataset_from_bio("/content/drive/MyDrive/NLP_dernier_modification_07_06/abstracts07_06_bio_train.bio")
#val_dataset   = prepare_dataset_from_bio("/content/drive/MyDrive/NLP_dernier_modification_07_06/abstracts07_06_bio_val.bio")
#test_dataset  = prepare_dataset_from_bio("/content/drive/MyDrive/NLP_dernier_modification_07_06/abstracts07_06_bio_test.bio")

# ✅ Créer les datasets avec les bons fichiers
# ✅ Créer les datasets avec les bons fichiers
# train_dataset = prepare_dataset_from_bio("/content/abstracts07_06_bio_train_60_20_20.bio")
# val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val_60_20_20.bio")
# test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test_60_20_20.bio")

# ✅ Créer les datasets avec les bons fichiers
train_dataset = prepare_dataset_from_bio("/content/abstracts07_06_bio_train.bio")
val_dataset   = prepare_dataset_from_bio("/content/abstracts07_06_bio_val.bio")
test_dataset  = prepare_dataset_from_bio("/content/abstracts07_06_bio_test.bio")

__________________________________________________________

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# 📌 Nom du modèle
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Charger le tokenizer et le modèle
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Métriques NER
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# ✅ Configuration d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_pubmedbert",
    do_train=True,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 🚀 Entraînement
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5758
100,0.1509


TrainOutput(global_step=128, training_loss=0.2989308377727866, metrics={'train_runtime': 116.8177, 'train_samples_per_second': 8.766, 'train_steps_per_second': 1.096, 'total_flos': 267575136092160.0, 'train_loss': 0.2989308377727866, 'epoch': 4.0})

_______________________________________________________________________

In [None]:
import transformers
print(transformers.__version__)


4.52.4


In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 0.9267
F1-score : 0.9267

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.99      0.96      0.98       103
        GENE       0.90      0.86      0.88        63
     PROTEIN       0.88      0.93      0.91       105
     SYMPTOM       1.00      1.00      1.00         2

   micro avg       0.93      0.93      0.93       273
   macro avg       0.94      0.94      0.94       273
weighted avg       0.93      0.93      0.93       273



In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.9585
F1-score : 0.9585

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.99      0.99      0.99       120
        GENE       0.93      0.95      0.94        66
     PROTEIN       0.94      0.92      0.93        78
     SYMPTOM       0.00      0.00      0.00         1

   micro avg       0.96      0.96      0.96       265
   macro avg       0.71      0.72      0.72       265
weighted avg       0.96      0.96      0.96       265



  _warn_prf(average, modifier, msg_start, len(result))


__________________________________________________________________

# **3 eme modele**

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, f1_score

# 📌 Nom du modèle SapBERT
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

# Charger le tokenizer et le modèle
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 🔍 Métriques de NER
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=False)
    }

# ✅ Configuration d'entraînement
training_args = TrainingArguments(
    output_dir="./ner_sapbert",
    do_train=True,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)

# ⚙️ Trainer HuggingFace
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 🚀 Lancer l'entraînement
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cambridgeltl/SapBERT-from-PubMedBERT-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,1.1332
100,0.6969


TrainOutput(global_step=128, training_loss=0.8190451115369797, metrics={'train_runtime': 106.9887, 'train_samples_per_second': 9.571, 'train_steps_per_second': 1.196, 'total_flos': 267575136092160.0, 'train_loss': 0.8190451115369797, 'epoch': 4.0})

In [None]:
# Évaluer le modèle sur le jeu de validation
results = trainer.evaluate()

# Affichage des métriques principales
print("📊 Résultats de l'évaluation :")
print(f"Accuracy : {results['eval_accuracy']:.4f}")
print(f"F1-score : {results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(results['eval_report'])


📊 Résultats de l'évaluation :
Accuracy : 0.7889
F1-score : 0.7889

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.89      0.91      0.90       127
        GENE       0.86      0.48      0.62        67
     PROTEIN       0.63      0.92      0.75        71
     SYMPTOM       0.00      0.00      0.00         5

   micro avg       0.79      0.79      0.79       270
   macro avg       0.60      0.58      0.57       270
weighted avg       0.80      0.79      0.77       270



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# 📊 Évaluation sur le jeu de test
test_results = trainer.evaluate(eval_dataset=test_dataset)

# 📋 Affichage des métriques
print("📊 Résultats sur le jeu de test :")
print(f"Accuracy : {test_results['eval_accuracy']:.4f}")
print(f"F1-score : {test_results['eval_f1']:.4f}")
print("\n📄 Rapport détaillé :")
print(test_results['eval_report'])


📊 Résultats sur le jeu de test :
Accuracy : 0.7438
F1-score : 0.7438

📄 Rapport détaillé :
              precision    recall  f1-score   support

     DISEASE       0.82      0.96      0.89       121
        GENE       0.58      0.32      0.41        60
     PROTEIN       0.68      0.82      0.74        87
     SYMPTOM       1.00      0.23      0.38        13

   micro avg       0.74      0.74      0.74       281
   macro avg       0.77      0.58      0.60       281
weighted avg       0.73      0.74      0.72       281

