In [1]:
from transformers import CamembertTokenizer, CamembertForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset
import evaluate
import numpy as np
import torch

print("Imports OK !")

Imports OK !


In [2]:
from transformers import CamembertTokenizerFast  # Fast version !

# Charger le dataset
dataset = load_dataset("Jean-Baptiste/wikiner_fr")

# Charger le tokenizer FAST (pas CamembertTokenizer)
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

# Labels
label_names = ['O', 'LOC', 'PER', 'MISC', 'ORG']
num_labels = len(label_names)

print(f"Dataset chargé : {len(dataset['train'])} exemples")
print(f"Labels : {label_names}")

Dataset chargé : 120682 exemples
Labels : ['O', 'LOC', 'PER', 'MISC', 'ORG']


In [3]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignorer padding
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Ignorer sous-tokens
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Appliquer le prétraitement
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
print("Prétraitement terminé !")

Map:   0%|          | 0/13410 [00:00<?, ? examples/s]

Prétraitement terminé !


In [4]:
# Charger le modèle CamemBERT pour NER
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=num_labels
)

# Déplacer sur GPU
if torch.cuda.is_available():
    model = model.to("cuda")
    print(f"✓ Modèle chargé sur GPU : {torch.cuda.get_device_name(0)}")
else:
    print("⚠ Pas de GPU, utilisation CPU")

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Modèle chargé sur GPU : Quadro RTX 6000


In [5]:
# Métrique
seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    
    true_labels = []
    true_preds = []
    
    for pred, label in zip(predictions, labels):
        true_label = []
        true_pred = []
        for p, l in zip(pred, label):
            if l != -100:
                true_label.append(label_names[l])
                true_pred.append(label_names[p])
        true_labels.append(true_label)
        true_preds.append(true_pred)
    
    results = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./camembert-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=400,
    per_device_eval_batch_size=400,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
)

print("Configuration prête !")

Configuration prête !


In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Lancer l'entraînement
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.198567,0.849122,0.871603,0.860216,0.9822
2,0.329100,0.13633,0.874978,0.886351,0.880628,0.984739
3,0.329100,0.124036,0.875308,0.886263,0.880752,0.984621




TrainOutput(global_step=906, training_loss=0.24292834705074892, metrics={'train_runtime': 848.6293, 'train_samples_per_second': 426.624, 'train_steps_per_second': 1.068, 'total_flos': 2.365100286172416e+16, 'train_loss': 0.24292834705074892, 'epoch': 3.0})

In [7]:
trainer.save_model("./camembert-ner-final")
tokenizer.save_pretrained("./camembert-ner-final")
print("✓ Modèle sauvegardé dans ./camembert-ner-final")

✓ Modèle sauvegardé dans ./camembert-ner-final
