In [2]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from datasets import load_dataset

In [3]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import numpy as np

In [4]:
# Tokenizer for the "camembert-base" checkpoint.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Sequence-classification model with 3 output labels
# (entailment, neutral, contradiction).
# NOTE(review): `tokenizer` and `model` are both rebound in later cells
# (fast tokenizer, token-classification model), so this cell looks like a
# leftover from a different task — confirm it is still needed.
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=3,
)

print("Tokenizer et mod√®le charg√©s !")

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer et mod√®le charg√©s !


In [5]:
import spacy
import json

# 1. Load the spaCy pipeline for French. `nlp` is a notebook-level global that
#    the annotation helpers defined below call directly.
nlp = spacy.load("fr_core_news_md")

# 2. D√©finir la fonction d'annotation
def annotate_pos(text):
    """Run the global spaCy pipeline on one text and return its tokens and POS tags.

    Args:
        text: The raw string handed to ``nlp``.

    Returns:
        dict: ``{"tokens": [...], "pos_tags": [...]}`` — two parallel lists
        holding each token's ``text`` and ``pos_`` attribute, in document order.
    """
    parsed = nlp(text)
    # Walk the document once, then split the (text, tag) pairs into two lists.
    pairs = [(tok.text, tok.pos_) for tok in parsed]
    return {
        "tokens": [word for word, _ in pairs],
        "pos_tags": [tag for _, tag in pairs],
    }

In [17]:
def process_text_file(input_file, output_file, batch_size=2000):
    """POS-tag every non-empty line of a text file and save the result as JSON.

    Each non-blank line of ``input_file`` is stripped and passed through the
    global spaCy pipeline ``nlp``. One record per line is collected, shaped
    ``{"tokens": [...], "pos_tags": [...]}``, and the full list is written to
    ``output_file`` as a single JSON array.

    Args:
        input_file: Path of the UTF-8 text file to read, one example per line.
        output_file: Path of the JSON file to write (overwritten if present).
        batch_size: Number of texts spaCy buffers per batch in ``nlp.pipe``.
            Defaults to 2000, the value previously hard-coded.

    Note:
        All records are held in memory until the final ``json.dump``.
    """
    dataset = []

    with open(input_file, "r", encoding="utf-8") as f:
        # Lazy generator: lines are stripped and blank ones dropped as spaCy
        # pulls them, so the raw file is never fully loaded.
        lines = (line.strip() for line in f if line.strip())

        # Only `token.text` and `token.pos_` are read below, so the components
        # that produce dependency parses, named entities and lemmas are
        # skipped for speed. Names missing from the pipeline are ignored.
        docs = nlp.pipe(
            lines,
            batch_size=batch_size,
            disable=["parser", "ner", "lemmatizer"],
        )
        for doc in docs:
            dataset.append({
                "tokens": [token.text for token in doc],
                "pos_tags": [token.pos_ for token in doc]
            })

    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"Dataset sauvegard√© dans {output_file} avec {len(dataset)} exemples.")


In [18]:
# One-off preprocessing call, left commented out. The saved output of this cell
# reports that an earlier run wrote ../Youdas/postagging_fr_part1.json;
# uncomment the three lines below to regenerate that file.
#input_file = "../dataset_g5/fr_part_1.txt"  # Replace with the path to your input file
#output_file = "../Youdas/postagging_fr_part1.json"  # Output file
#process_text_file(input_file, output_file)

Dataset sauvegard√© dans ../Youdas/postagging_fr_part1.json avec 6689782 exemples.


In [None]:
from datasets import Dataset
import json

# Read the POS-tagged corpus produced by the preprocessing step into memory.
with open("../Youdas/postagging_fr_part1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Wrap the records in a Hugging Face Dataset and print the first one as a
# quick sanity check.
dataset = Dataset.from_list(data)
print(dataset[0])

# First cut: 80% train / 20% held out (fixed seed so the split is repeatable).
split1 = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = split1['train'], split1['test']

# Second cut: halve the held-out part into eval and test
# (each 50% of it, i.e. 10% of the total).
split2 = temp_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset, test_dataset = split2['train'], split2['test']

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}, Test: {len(test_dataset)}")


In [9]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# R√©cup√©rer tous les tags uniques
# Build the tag inventory from every split, not just train. The alignment step
# looks tags up with `tag2id[...]` for the eval and test splits too, so a tag
# missing from the train split would otherwise raise a KeyError there.
# This only fixes the label set; no eval/test examples are used for training.
all_tags = {
    tag
    for split in (train_dataset, eval_dataset, test_dataset)
    for example in split["pos_tags"]
    for tag in example
}

# Sorting gives a deterministic tag -> integer id assignment across runs.
tag2id = {tag: idx for idx, tag in enumerate(sorted(all_tags))}
id2tag = {idx: tag for tag, idx in tag2id.items()}
num_labels = len(tag2id)

print(f"Nombre de tags uniques : {num_labels}")
print("Exemple de mapping :", {k: tag2id[k] for k in list(tag2id)[:5]})


Nombre de tags uniques : 17
Exemple de mapping : {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'AUX': 3, 'CCONJ': 4}


In [10]:
from transformers import CamembertTokenizerFast
# Rebinds `tokenizer` to the fast tokenizer, replacing the CamembertTokenizer
# created in an earlier cell. The alignment function below calls `word_ids()`
# on the encoding.
# NOTE(review): `word_ids()` is presumably only provided by fast tokenizers,
# which would be why the fast class is needed here — confirm.
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align one POS label id to each sub-token.

    Args:
        examples: A batch mapping with a ``"tokens"`` entry (one word list per
            example) and a parallel ``"pos_tags"`` entry (one tag per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list per
        example in which the first sub-token of each word carries
        ``tag2id[tag]`` and every other position carries ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples["pos_tags"]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            # A position gets a real label only when it maps to a word (not
            # None, i.e. not a special token) and starts that word. Later
            # sub-tokens of the same word get -100 like the special tokens.
            starts_word = current_word is not None and current_word != last_word
            row.append(tag2id[word_tags[current_word]] if starts_word else -100)
            last_word = current_word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Tokenize and label-align each split, one batch of examples at a time.
tokenized_train = train_dataset.map(function=tokenize_and_align_labels, batched=True)
tokenized_eval = eval_dataset.map(function=tokenize_and_align_labels, batched=True)
tokenized_test = test_dataset.map(function=tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
from transformers import CamembertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# CamemBERT with a token-classification head sized to the POS tag inventory.
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=num_labels,
)

# Collator used by the Trainer to assemble batches (padding enabled).
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

# Training configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# No compute_metrics is supplied here.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

# Start fine-tuning; as the last expression, the result is shown by the cell.
trainer.train()


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,2.7597,2.690702
2,2.6348,2.607784
3,2.5679,2.579136


TrainOutput(global_step=30, training_loss=2.654128901163737, metrics={'train_runtime': 22.7121, 'train_samples_per_second': 20.209, 'train_steps_per_second': 1.321, 'total_flos': 59501563109850.0, 'train_loss': 2.654128901163737, 'epoch': 3.0})

In [12]:
import torch
import numpy as np
from transformers import (
    CamembertTokenizerFast,
    CamembertForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from sklearn.metrics import accuracy_score

# ------------------------------------------------------------
# 0. Automatic GPU / CPU selection
# ------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(" GPU d√©tect√© :", torch.cuda.get_device_name(0))
else:
    print(" Aucun GPU d√©tect√© ‚Üí utilisation du CPU")

# ------------------------------------------------------------
# 1. Load the tokenizer
# ------------------------------------------------------------
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

# ------------------------------------------------------------
# 2. Load the model (fresh token-classification head) and move it
#    to the selected device
# ------------------------------------------------------------
model = CamembertForTokenClassification.from_pretrained("camembert-base", num_labels=num_labels)
model = model.to(device)

print(f"‚úì Mod√®le d√©plac√© sur : {device}")

# ------------------------------------------------------------
# 3. Fonction compute_metrics pour accuracy
# ------------------------------------------------------------
def compute_metrics(pred):
    """Compute token-level accuracy over the positions that carry a real label.

    Args:
        pred: A two-item ``(predictions, labels)`` object. The last axis of
            ``predictions`` is arg-maxed to get a predicted class id per
            position. ``labels`` holds the gold ids, with ``-100`` at
            positions to ignore.
            # assumes predictions is (batch, seq_len, num_labels) and labels
            # is (batch, seq_len) — TODO confirm against the Trainer output

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is a Python float, the
        fraction of non-ignored positions predicted correctly. It is ``0.0``
        when no position carries a label.
    """
    predictions, labels = pred
    labels = np.asarray(labels)
    preds = np.argmax(predictions, axis=-1)

    # Score only positions whose label is not the -100 ignore marker.
    mask = labels != -100
    if not mask.any():
        # Nothing to score: return 0.0 rather than averaging an empty array.
        return {"accuracy": 0.0}

    accuracy = float(np.mean(preds[mask] == labels[mask]))
    return {"accuracy": accuracy}

# ------------------------------------------------------------
# 4. Data collator
# ------------------------------------------------------------
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

# ------------------------------------------------------------
# 5. Training arguments
# ------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",      # evaluate at the end of every epoch
    save_strategy="epoch",      # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,

    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    report_to="none",

    load_best_model_at_end=True,      # reload the best checkpoint after training
    metric_for_best_model="accuracy", # "best" is judged on the accuracy metric
    greater_is_better=True,           # higher accuracy is better
)

# ------------------------------------------------------------
# 6. Trainer
# ------------------------------------------------------------
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
# NOTE(review): the stray "trainer = Trainer(" line in this cell's saved output
# looks like the echo of that deprecation warning — confirm the installed
# transformers version accepts `processing_class`.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------------------------------------------------------
# 7. Training
# ------------------------------------------------------------
print(f"D√©but de l'entra√Ænement sur : {device}")
trainer.train()


‚úì GPU d√©tect√© : Quadro RTX 6000


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úì Mod√®le d√©plac√© sur : cuda
üöÄ D√©but de l'entra√Ænement sur : cuda


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,2.7497,2.679142,0.297747
2,2.6221,2.597438,0.399608
3,2.5583,2.568077,0.415279


TrainOutput(global_step=30, training_loss=2.643375587463379, metrics={'train_runtime': 18.4299, 'train_samples_per_second': 24.905, 'train_steps_per_second': 1.628, 'total_flos': 59501563109850.0, 'train_loss': 2.643375587463379, 'epoch': 3.0})

In [None]:
from seqeval.metrics import classification_report

# Per-token report from scikit-learn (already used elsewhere in this notebook),
# imported under an alias so the seqeval name imported above is not rebound.
# NOTE(review): seqeval's report targets chunk-style (B-/I-) tag sequences, so a
# per-token report is used for plain POS tags — confirm this is the intended
# evaluation.
from sklearn.metrics import classification_report as token_classification_report

predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=2)
# Gold ids returned with the predictions; -100 marks positions to ignore.
label_ids = predictions.label_ids

# Convert ids to tags, keeping only positions whose gold label is not -100.
# The filter is on the gold label for both lists: an argmax prediction is
# never -100, so filtering predictions on their own value would keep ignored
# positions and misalign the predicted and gold sequences.
pred_labels = [
    [id2tag[int(p)] for p, l in zip(pred_row, label_row) if l != -100]
    for pred_row, label_row in zip(preds, label_ids)
]
true_labels = [
    [id2tag[int(l)] for l in label_row if l != -100]
    for label_row in label_ids
]

# Flatten the per-sentence lists: the report is computed over individual tokens.
flat_true = [tag for sentence in true_labels for tag in sentence]
flat_pred = [tag for sentence in pred_labels for tag in sentence]

# zero_division=0 reports 0 for tags that are never predicted.
print(token_classification_report(flat_true, flat_pred, zero_division=0))
