In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import numpy as np
import torch

print("Imports OK !")

Imports OK !


In [2]:
# Charger mBERT (multilingual BERT)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)

# Charger XNLI français
dataset = load_dataset("xnli", "fr")

print(f"Modèle : bert-base-multilingual-cased")
print(f"Dataset : {len(dataset['train'])} train, {len(dataset['test'])} test")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Modèle : bert-base-multilingual-cased
Dataset : 392702 train, 5010 test


In [3]:
def preprocess_function(examples):
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["premise", "hypothesis"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

print("Prétraitement terminé !")

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/5010 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Prétraitement terminé !


In [4]:
# Déplacer sur GPU
if torch.cuda.is_available():
    model = model.to("cuda")
    print(f"✓ Modèle sur GPU : {torch.cuda.get_device_name(0)}")

# Métrique
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./mbert-xnli",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Configuration prête !")

✓ Modèle sur GPU : Quadro RTX 6000
Configuration prête !


  trainer = Trainer(


In [5]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6053,0.591594,0.756627
2,0.5264,0.576168,0.776707
3,0.4494,0.58113,0.7751
4,0.3936,0.584432,0.783133
5,0.3456,0.623443,0.782329


TrainOutput(global_step=15340, training_loss=0.4774251796120615, metrics={'train_runtime': 4102.1163, 'train_samples_per_second': 478.658, 'train_steps_per_second': 3.74, 'total_flos': 1.2915645671077632e+17, 'train_loss': 0.4774251796120615, 'epoch': 5.0})

In [6]:
results = trainer.evaluate(tokenized_dataset["test"])

print("\n" + "="*50)
print("RÉSULTATS mBERT - XNLI TEST SET")
print("="*50)
print(f"Accuracy mBERT    : {results['eval_accuracy']*100:.2f}%")
print(f"Accuracy CamemBERT: 81.78%")
print(f"Accuracy Article  : 76.9% (mBERT)")
print("="*50)


RÉSULTATS mBERT - XNLI TEST SET
Accuracy mBERT    : 77.54%
Accuracy CamemBERT: 81.78%
Accuracy Article  : 76.9% (mBERT)


In [7]:
trainer.save_model("./mbert-xnli-final")
tokenizer.save_pretrained("./mbert-xnli-final")
print("✓ Modèle sauvegardé !")

✓ Modèle sauvegardé !
