In [1]:
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset
import evaluate
import numpy as np
import torch

print("Imports OK !")

Imports OK !


In [2]:
# Charger mBERT
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

# Charger le dataset NER
dataset = load_dataset("Jean-Baptiste/wikiner_fr")

# Labels
label_names = ['O', 'LOC', 'PER', 'MISC', 'ORG']
num_labels = len(label_names)

# Charger le modèle
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_labels)

print(f"Dataset : {len(dataset['train'])} train, {len(dataset['test'])} test")
print(f"Labels : {label_names}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset : 120682 train, 13410 test
Labels : ['O', 'LOC', 'PER', 'MISC', 'ORG']


In [3]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
print("Prétraitement terminé !")

Prétraitement terminé !


In [4]:
# Déplacer sur GPU
if torch.cuda.is_available():
    model = model.to("cuda")
    print(f"✓ Modèle sur GPU : {torch.cuda.get_device_name(0)}")

# Métrique
seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    
    true_labels = []
    true_preds = []
    
    for pred, label in zip(predictions, labels):
        true_label = []
        true_pred = []
        for p, l in zip(pred, label):
            if l != -100:
                true_label.append(label_names[l])
                true_pred.append(label_names[p])
        true_labels.append(true_label)
        true_preds.append(true_pred)
    
    results = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./mbert-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Configuration prête !")

✓ Modèle sur GPU : Quadro RTX 6000
Configuration prête !


  trainer = Trainer(


In [5]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0749,0.042528,0.889154,0.897908,0.89351,0.986008
2,0.0323,0.038093,0.901078,0.911211,0.906116,0.988055
3,0.0244,0.038644,0.902813,0.910161,0.906472,0.988157




TrainOutput(global_step=2829, training_loss=0.038441937649869466, metrics={'train_runtime': 924.5964, 'train_samples_per_second': 391.572, 'train_steps_per_second': 3.06, 'total_flos': 2.365100286172416e+16, 'train_loss': 0.038441937649869466, 'epoch': 3.0})

In [6]:
results = trainer.evaluate(tokenized_dataset["test"])

print("\n" + "="*50)
print("RÉSULTATS mBERT - NER TEST SET")
print("="*50)
print(f"F1-Score mBERT     : {results['eval_f1']*100:.2f}%")
print(f"F1-Score CamemBERT : 88.07%")
print("="*50)
print("\nCOMPARAISON COMPLÈTE :")
print("="*50)
print(f"{'Tâche':<10} | {'CamemBERT':<12} | {'mBERT':<12}")
print("-"*40)
print(f"{'XNLI':<10} | {'81.78%':<12} | {'77.54%':<12}")
print(f"{'NER':<10} | {'88.07%':<12} | {results['eval_f1']*100:.2f}%")
print("="*50)




RÉSULTATS mBERT - NER TEST SET
F1-Score mBERT     : 90.65%
F1-Score CamemBERT : 88.07%

COMPARAISON COMPLÈTE :
Tâche      | CamemBERT    | mBERT       
----------------------------------------
XNLI       | 81.78%       | 77.54%      
NER        | 88.07%       | 90.65%
